Files

507 lines
14 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2 } from './latin1.js'
import { getTable } from './multi-byte.table.js'
export const E_STRICT = 'Input is not well-formed for this encoding'
// TODO: optimize
// If the decoder is not cleared properly, state can be preserved between non-streaming calls!
// See comment about fatal stream
// Common between euc-kr and big5
function bigDecoder(err, pair) {
let lead = 0
let oi = 0
let o16
const decodeLead = (b) => {
const p = pair(lead, b)
lead = 0
if (typeof p === 'number') {
o16[oi++] = p
} else if (p) {
// This is still faster than string concatenation. Can we optimize strings though?
for (let i = 0; i < p.length; i++) o16[oi++] = p.charCodeAt(i)
} else {
o16[oi++] = err()
if (b < 128) o16[oi++] = b
}
}
const decode = (arr, start, end, stream) => {
let i = start
o16 = new Uint16Array(end - start + (lead ? 1 : 0)) // there are pairs but they consume more than one byte
oi = 0
if (lead && i < end) decodeLead(arr[i++])
while (i < end) {
const b = arr[i++]
if (b < 128) {
o16[oi++] = b
} else if (b === 0x80 || b === 0xff) {
o16[oi++] = err()
} else {
lead = b
if (i < end) decodeLead(arr[i++])
}
}
if (lead && !stream) {
lead = 0
o16[oi++] = err()
}
const res = decodeUCS2(o16, oi)
o16 = null
return res
}
return { decode, isAscii: () => lead === 0 }
}
// All except iso-2022-jp are ASCII supersets
// When adding something that is not an ASCII superset, ajust the ASCII fast path
const REP = 0xff_fd
const mappers = {
// https://encoding.spec.whatwg.org/#euc-kr-decoder
'euc-kr': (err) => {
const euc = getTable('euc-kr')
return bigDecoder(err, (l, b) => {
if (b < 0x41 || b > 0xfe) return
const cp = euc[(l - 0x81) * 190 + b - 0x41]
return cp !== undefined && cp !== REP ? cp : undefined
})
},
// https://encoding.spec.whatwg.org/#euc-jp-decoder
'euc-jp': (err) => {
const jis0208 = getTable('jis0208')
const jis0212 = getTable('jis0212')
let j12 = false
let lead = 0
let oi = 0
let o16
const decodeLead = (b) => {
if (lead === 0x8e && b >= 0xa1 && b <= 0xdf) {
lead = 0
o16[oi++] = 0xfe_c0 + b
} else if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
j12 = true
lead = b
} else {
let cp
if (lead >= 0xa1 && lead <= 0xfe && b >= 0xa1 && b <= 0xfe) {
cp = (j12 ? jis0212 : jis0208)[(lead - 0xa1) * 94 + b - 0xa1]
}
lead = 0
j12 = false
if (cp !== undefined && cp !== REP) {
o16[oi++] = cp
} else {
o16[oi++] = err()
if (b < 128) o16[oi++] = b
}
}
}
const decode = (arr, start, end, stream) => {
let i = start
o16 = new Uint16Array(end - start + (lead ? 1 : 0))
oi = 0
if (lead && i < end) decodeLead(arr[i++])
if (lead && i < end) decodeLead(arr[i++]) // could be two leads, but no more
while (i < end) {
const b = arr[i++]
if (b < 128) {
o16[oi++] = b
} else if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) {
o16[oi++] = err()
} else {
lead = b
if (i < end) decodeLead(arr[i++])
if (lead && i < end) decodeLead(arr[i++]) // could be two leads
}
}
if (lead && !stream) {
lead = 0
j12 = false // can be true only when lead is non-zero
o16[oi++] = err()
}
const res = decodeUCS2(o16, oi)
o16 = null
return res
}
return { decode, isAscii: () => lead === 0 } // j12 can be true only when lead is non-zero
},
// https://encoding.spec.whatwg.org/#iso-2022-jp-decoder
'iso-2022-jp': (err) => {
const jis0208 = getTable('jis0208')
let dState = 1
let oState = 1
let lead = 0 // 0 or 0x21-0x7e
let out = false
const bytes = (pushback, b) => {
if (dState < 5 && b === 0x1b) {
dState = 6 // escape start
return
}
switch (dState) {
case 1:
case 2:
// ASCII, Roman (common)
out = false
if (dState === 2) {
if (b === 0x5c) return 0xa5
if (b === 0x7e) return 0x20_3e
}
if (b <= 0x7f && b !== 0x0e && b !== 0x0f) return b
return err()
case 3:
// Katakana
out = false
if (b >= 0x21 && b <= 0x5f) return 0xff_40 + b
return err()
case 4:
// Leading byte
out = false
if (b < 0x21 || b > 0x7e) return err()
lead = b
dState = 5
return
case 5:
// Trailing byte
out = false
if (b === 0x1b) {
dState = 6 // escape start
return err()
}
dState = 4
if (b >= 0x21 && b <= 0x7e) {
const cp = jis0208[(lead - 0x21) * 94 + b - 0x21]
if (cp !== undefined && cp !== REP) return cp
}
return err()
case 6:
// Escape start
if (b === 0x24 || b === 0x28) {
lead = b
dState = 7
return
}
out = false
dState = oState
pushback.push(b)
return err()
case 7: {
// Escape
const l = lead
lead = 0
let s
if (l === 0x28) {
// eslint-disable-next-line unicorn/prefer-switch
if (b === 0x42) {
s = 1
} else if (b === 0x4a) {
s = 2
} else if (b === 0x49) {
s = 3
}
} else if (l === 0x24 && (b === 0x40 || b === 0x42)) {
s = 4
}
if (s) {
dState = oState = s
const output = out
out = true
return output ? err() : undefined
}
out = false
dState = oState
pushback.push(b, l)
return err()
}
}
}
const eof = (pushback) => {
if (dState < 5) return null
out = false
switch (dState) {
case 5:
dState = 4
return err()
case 6:
dState = oState
return err()
case 7: {
dState = oState
pushback.push(lead)
lead = 0
return err()
}
}
}
const decode = (arr, start, end, stream) => {
const o16 = new Uint16Array(end - start + 2) // err in eof + lead from state
let oi = 0
let i = start
const pushback = [] // local and auto-cleared
// First, dump everything until EOF
// Same as the full loop, but without EOF handling
while (i < end || pushback.length > 0) {
const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
if (c !== undefined) o16[oi++] = c // 16-bit
}
// Then, dump EOF. This needs the same loop as the characters can be pushed back
if (!stream) {
while (i <= end || pushback.length > 0) {
if (i < end || pushback.length > 0) {
const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
if (c !== undefined) o16[oi++] = c // 16-bit
} else {
const c = eof(pushback)
if (c === null) break // clean exit
o16[oi++] = c
}
}
}
// Chrome and WebKit fail on this, we don't: completely destroy the old decoder state when finished streaming
// > If thiss do not flush is false, then set thiss decoder to a new instance of thiss encodings decoder,
// > Set thiss do not flush to options["stream"]
if (!stream) {
dState = oState = 1
lead = 0
out = false
}
return decodeUCS2(o16, oi)
}
return { decode, isAscii: () => false }
},
// https://encoding.spec.whatwg.org/#shift_jis-decoder
shift_jis: (err) => {
const jis0208 = getTable('jis0208')
let lead = 0
let oi = 0
let o16
const decodeLead = (b) => {
const l = lead
lead = 0
if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
if (p >= 8836 && p <= 10_715) {
o16[oi++] = 0xe0_00 - 8836 + p
return
}
const cp = jis0208[p]
if (cp !== undefined && cp !== REP) {
o16[oi++] = cp
return
}
}
o16[oi++] = err()
if (b < 128) o16[oi++] = b
}
const decode = (arr, start, end, stream) => {
o16 = new Uint16Array(end - start + (lead ? 1 : 0))
oi = 0
let i = start
if (lead && i < end) decodeLead(arr[i++])
while (i < end) {
const b = arr[i++]
if (b <= 0x80) {
o16[oi++] = b // 0x80 is allowed
} else if (b >= 0xa1 && b <= 0xdf) {
o16[oi++] = 0xfe_c0 + b
} else if (b === 0xa0 || b > 0xfc) {
o16[oi++] = err()
} else {
lead = b
if (i < end) decodeLead(arr[i++])
}
}
if (lead && !stream) {
lead = 0
o16[oi++] = err()
}
const res = decodeUCS2(o16, oi)
o16 = null
return res
}
return { decode, isAscii: () => lead === 0 }
},
// https://encoding.spec.whatwg.org/#gbk-decoder
gbk: (err) => mappers.gb18030(err), // 10.1.1. GBKs decoder is gb18030s decoder
// https://encoding.spec.whatwg.org/#gb18030-decoder
gb18030: (err) => {
const gb18030 = getTable('gb18030')
const gb18030r = getTable('gb18030-ranges')
let g1 = 0, g2 = 0, g3 = 0 // prettier-ignore
const index = (p) => {
if ((p > 39_419 && p < 189_000) || p > 1_237_575) return
if (p === 7457) return 0xe7_c7
let a = 0, b = 0 // prettier-ignore
for (const [c, d] of gb18030r) {
if (c > p) break
a = c
b = d
}
return b + p - a
}
// g1 is 0 or 0x81-0xfe
// g2 is 0 or 0x30-0x39
// g3 is 0 or 0x81-0xfe
const decode = (arr, start, end, stream) => {
const o16 = new Uint16Array(end - start + (g1 ? 3 : 0)) // even with pushback it's at most 1 char per byte
let oi = 0
let i = start
const pushback = [] // local and auto-cleared
// First, dump everything until EOF
// Same as the full loop, but without EOF handling
while (i < end || pushback.length > 0) {
const b = pushback.length > 0 ? pushback.pop() : arr[i++]
if (g1) {
// g2 can be set only when g1 is set, g3 can be set only when g2 is set
// hence, 3 checks for g3 is faster than 3 checks for g1
if (g2) {
if (g3) {
if (b < 0x30 || b > 0x39) {
pushback.push(b, g3, g2)
g1 = g2 = g3 = 0
o16[oi++] = err()
} else {
const p = index(
(g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30
)
g1 = g2 = g3 = 0
if (p === undefined) {
o16[oi++] = err()
} else if (p <= 0xff_ff) {
o16[oi++] = p // Can validly return replacement
} else {
const d = p - 0x1_00_00
o16[oi++] = 0xd8_00 | (d >> 10)
o16[oi++] = 0xdc_00 | (d & 0x3_ff)
}
}
} else if (b >= 0x81 && b <= 0xfe) {
g3 = b
} else {
pushback.push(b, g2)
g1 = g2 = 0
o16[oi++] = err()
}
} else if (b >= 0x30 && b <= 0x39) {
g2 = b
} else {
let cp
if (b >= 0x40 && b <= 0xfe && b !== 0x7f) {
cp = gb18030[(g1 - 0x81) * 190 + b - (b < 0x7f ? 0x40 : 0x41)]
}
g1 = 0
if (cp !== undefined && cp !== REP) {
o16[oi++] = cp // 16-bit
} else {
o16[oi++] = err()
if (b < 128) o16[oi++] = b // can be processed immediately
}
}
} else if (b < 128) {
o16[oi++] = b
} else if (b === 0x80) {
o16[oi++] = 0x20_ac
} else if (b === 0xff) {
o16[oi++] = err()
} else {
g1 = b
}
}
// if g1 = 0 then g2 = g3 = 0
if (g1 && !stream) {
g1 = g2 = g3 = 0
o16[oi++] = err()
}
return decodeUCS2(o16, oi)
}
return { decode, isAscii: () => g1 === 0 } // if g1 = 0 then g2 = g3 = 0
},
// https://encoding.spec.whatwg.org/#big5
big5: (err) => {
// The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
// We store that as strings
const big5 = getTable('big5')
return bigDecoder(err, (l, b) => {
if (b < 0x40 || (b > 0x7e && b < 0xa1) || b === 0xff) return
return big5[(l - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)] // strings
})
},
}
export const isAsciiSuperset = (enc) => enc !== 'iso-2022-jp' // all others are ASCII supersets and can use fast path
export function multibyteDecoder(enc, loose = false) {
if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
// Input is assumed to be typechecked already
let mapper
const asciiSuperset = isAsciiSuperset(enc)
let streaming // because onErr is cached in mapper
const onErr = loose
? () => REP
: () => {
// The correct way per spec seems to be not destoying the decoder state in stream mode, even when fatal
// Decoders big5, euc-jp, euc-kr, shift_jis, gb18030 / gbk - all clear state before throwing unless EOF, so not affected
// iso-2022-jp is the only tricky one one where this !stream check matters in non-stream mode
if (!streaming) mapper = null // destroy state, effectively the same as 'do not flush' = false, but early
throw new TypeError(E_STRICT)
}
return (arr, stream = false) => {
let res = ''
if (asciiSuperset && (!mapper || mapper.isAscii?.())) {
const prefixLen = asciiPrefix(arr)
if (prefixLen === arr.length) return decodeAscii(arr) // ascii
res = decodeLatin1(arr, 0, prefixLen) // TODO: check if decodeAscii with subarray is faster for small prefixes too
}
streaming = stream // affects onErr
if (!mapper) mapper = mappers[enc](onErr)
return res + mapper.decode(arr, res.length, arr.length, stream)
}
}