// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
||
|
|
||
|
// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// To convert the bytes of an io.Reader r from the encoding e to UTF-8:
//	rInUTF8 := transform.NewReader(r, e.NewDecoder())
// and to convert from UTF-8 to the encoding e:
//	wInUTF8 := transform.NewWriter(w, e.NewEncoder())
// In both cases, import "golang.org/x/text/transform".
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
|
||
|
package encoding // import "golang.org/x/text/encoding"
|
||
|
|
||
|
import (
|
||
|
"errors"
|
||
|
"unicode/utf8"
|
||
|
|
||
|
"golang.org/x/text/transform"
|
||
|
)
|
||
|
|
||
|
// Encoding is a character set encoding that can be transformed to and from
|
||
|
// UTF-8.
|
||
|
type Encoding interface {
|
||
|
// NewDecoder returns a transformer that converts to UTF-8.
|
||
|
//
|
||
|
// Transforming source bytes that are not of that encoding will not
|
||
|
// result in an error per se. Each byte that cannot be transcoded will
|
||
|
// be represented in the output by the UTF-8 encoding of '\uFFFD', the
|
||
|
// replacement rune.
|
||
|
NewDecoder() transform.Transformer
|
||
|
|
||
|
// NewEncoder returns a transformer that converts from UTF-8.
|
||
|
//
|
||
|
// Transforming source bytes that are not valid UTF-8 will not result in
|
||
|
// an error per se. Each rune that cannot be transcoded will be
|
||
|
// represented in the output by an encoding-specific replacement such as
|
||
|
// "\x1a" (the ASCII substitute character) or "\xff\xfd". To return
|
||
|
// early with error instead, use transform.Chain to preprocess the data
|
||
|
// with a UTF8Validator.
|
||
|
NewEncoder() transform.Transformer
|
||
|
}
|
||
|
|
||
|
// ASCIISub is the ASCII substitute character (SUB, 0x1A), as recommended by
// http://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
|
||
|
|
||
|
// Nop is the nop encoding. Its transformed bytes are the same as the source
|
||
|
// bytes; it does not replace invalid UTF-8 sequences.
|
||
|
var Nop Encoding = nop{}
|
||
|
|
||
|
type nop struct{}
|
||
|
|
||
|
func (nop) NewDecoder() transform.Transformer {
|
||
|
return transform.Nop
|
||
|
}
|
||
|
|
||
|
func (nop) NewEncoder() transform.Transformer {
|
||
|
return transform.Nop
|
||
|
}
|
||
|
|
||
|
// Replacement is the replacement encoding. Decoding from the replacement
|
||
|
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
|
||
|
// the replacement encoding yields the same as the source bytes except that
|
||
|
// invalid UTF-8 is converted to '\uFFFD'.
|
||
|
//
|
||
|
// It is defined at http://encoding.spec.whatwg.org/#replacement
|
||
|
var Replacement Encoding = replacement{}
|
||
|
|
||
|
type replacement struct{}
|
||
|
|
||
|
func (replacement) NewDecoder() transform.Transformer {
|
||
|
return replacementDecoder{}
|
||
|
}
|
||
|
|
||
|
func (replacement) NewEncoder() transform.Transformer {
|
||
|
return replacementEncoder{}
|
||
|
}
|
||
|
|
||
|
type replacementDecoder struct{ transform.NopResetter }
|
||
|
|
||
|
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||
|
if len(dst) < 3 {
|
||
|
return 0, 0, transform.ErrShortDst
|
||
|
}
|
||
|
if atEOF {
|
||
|
const fffd = "\ufffd"
|
||
|
dst[0] = fffd[0]
|
||
|
dst[1] = fffd[1]
|
||
|
dst[2] = fffd[2]
|
||
|
nDst = 3
|
||
|
}
|
||
|
return nDst, len(src), nil
|
||
|
}
|
||
|
|
||
|
type replacementEncoder struct{ transform.NopResetter }
|
||
|
|
||
|
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||
|
r, size := rune(0), 0
|
||
|
|
||
|
for ; nSrc < len(src); nSrc += size {
|
||
|
r = rune(src[nSrc])
|
||
|
|
||
|
// Decode a 1-byte rune.
|
||
|
if r < utf8.RuneSelf {
|
||
|
size = 1
|
||
|
|
||
|
} else {
|
||
|
// Decode a multi-byte rune.
|
||
|
r, size = utf8.DecodeRune(src[nSrc:])
|
||
|
if size == 1 {
|
||
|
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||
|
// handled above. We have invalid UTF-8 or we haven't seen the
|
||
|
// full character yet.
|
||
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||
|
err = transform.ErrShortSrc
|
||
|
break
|
||
|
}
|
||
|
r = '\ufffd'
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if nDst+utf8.RuneLen(r) > len(dst) {
|
||
|
err = transform.ErrShortDst
|
||
|
break
|
||
|
}
|
||
|
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||
|
}
|
||
|
return nDst, nSrc, err
|
||
|
}
|
||
|
|
||
|
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
|
||
|
|
||
|
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
|
||
|
// input byte that is not valid UTF-8.
|
||
|
var UTF8Validator transform.Transformer = utf8Validator{}
|
||
|
|
||
|
type utf8Validator struct{ transform.NopResetter }
|
||
|
|
||
|
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||
|
n := len(src)
|
||
|
if n > len(dst) {
|
||
|
n = len(dst)
|
||
|
}
|
||
|
for i := 0; i < n; {
|
||
|
if c := src[i]; c < utf8.RuneSelf {
|
||
|
dst[i] = c
|
||
|
i++
|
||
|
continue
|
||
|
}
|
||
|
_, size := utf8.DecodeRune(src[i:])
|
||
|
if size == 1 {
|
||
|
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||
|
// handled above. We have invalid UTF-8 or we haven't seen the
|
||
|
// full character yet.
|
||
|
err = ErrInvalidUTF8
|
||
|
if !atEOF && !utf8.FullRune(src[i:]) {
|
||
|
err = transform.ErrShortSrc
|
||
|
}
|
||
|
return i, i, err
|
||
|
}
|
||
|
if i+size > len(dst) {
|
||
|
return i, i, transform.ErrShortDst
|
||
|
}
|
||
|
for ; size > 0; size-- {
|
||
|
dst[i] = src[i]
|
||
|
i++
|
||
|
}
|
||
|
}
|
||
|
if len(src) > len(dst) {
|
||
|
err = transform.ErrShortDst
|
||
|
}
|
||
|
return n, n, err
|
||
|
}
|