122 lines
3.5 KiB
JavaScript
122 lines
3.5 KiB
JavaScript
import { fromBase64url } from '@exodus/bytes/base64.js'
|
|
import { utf16toString } from '@exodus/bytes/utf16.js'
|
|
import loadEncodings from './multi-byte.encodings.cjs'
|
|
import { to16input } from './utf16.js'
|
|
|
|
/**
 * Number of entries in each decoded index table, keyed by table id.
 * getTable() reads these values to preallocate the destination table.
 */
export const sizes = {
  jis0208: 11_104,
  jis0212: 7211,
  'euc-kr': 23_750,
  gb18030: 23_940,
  big5: 19_782,
}
|
|
|
|
// This is huge. It's _much_ smaller than https://npmjs.com/text-encoding though
|
|
// Exactly as mapped by the index table
|
|
// 0,x - hole of x empty elements
|
|
// n,c - continuous [c, ...] of length n
|
|
// $.. - references to common chunks
|
|
// -{x} - same as 1,{x}
|
|
|
|
// See tests/multi-byte.test.js to verify that this data decodes exactly into the encoding spec tables
|
|
|
|
// Compressed table data returned by loadEncodings(); assigned lazily on the first getTable() call.
// getTable() sets an entry to null once that table has been decoded.
let indices

// Cache of decoded tables, keyed by table id
const tables = new Map()

/* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
|
|
|
|
/**
 * Decodes one base64url-encoded payload chunk into its raw bytes.
 *
 * The decoded payload is split into two halves ("planes"). Output bytes at even
 * offsets are rebuilt from the first half and bytes at odd offsets from the second
 * half, each as a running sum modulo 256:
 *  - even bytes: running sum of (byte + 1), seeded with -1
 *  - odd bytes:  running sum of byte, seeded with 0
 *
 * @param {string} str - base64url text of the packed chunk
 * @returns {Uint8Array} reconstructed bytes, same length as the decoded payload
 */
function loadBase64(str) {
  const x = fromBase64url(str)
  const len = x.length
  const len2 = len >> 1 // offset of the second plane
  const y = new Uint8Array(len)
  let a = -1
  let b = 0
  for (let i = 0, j = 0; i < len; i += 2, j++) {
    a = (a + x[j] + 1) & 0xff
    b = (b + x[len2 + j]) & 0xff
    y[i] = a
    y[i + 1] = b
  }

  return y
}
|
|
|
|
/**
 * Expands a compressed table description into `res`, starting at index `pos`.
 *
 * `t` is a flat instruction list; a running `code` value (starting at 0) is carried
 * between instructions:
 *  - `0, n`          skip n slots, leaving them untouched
 *  - `n, d` (n > 0)  add d to the running code, then write n consecutive codes
 *  - `-d`            same as `1, d`
 *  - `'$...'` string that is an own key of `indices`: expand that shared chunk in place
 *                    (the nested expansion uses its own running code)
 *  - other string    base64url payload of little-endian UTF-16 data, written verbatim;
 *                    the running code continues after the payload's last value
 *
 * In string mode, values up to 0xFFFF are stored as numbers and anything above is
 * stored as a string holding that code point.
 *
 * @param {Array|Uint16Array} res - destination table, mutated in place
 * @param {Array<number|string>} t - instruction list
 * @param {number} pos - first destination index to write
 * @param {boolean} [stringMode=false] - store astral code points as strings
 * @returns {number} destination index just past the last written or skipped slot
 */
function unwrap(res, t, pos, stringMode = false) {
  let code = 0
  for (let i = 0; i < t.length; i++) {
    let x = t[i]
    if (typeof x === 'number') {
      if (x === 0) {
        pos += t[++i] // hole
        continue
      }

      if (x < 0) {
        code -= x
        x = 1
      } else {
        code += t[++i]
      }

      if (stringMode) {
        for (let k = 0; k < x; k++, pos++, code++) {
          res[pos] = code <= 0xff_ff ? code : String.fromCodePoint(code)
        }
      } else {
        for (let k = 0; k < x; k++, pos++, code++) res[pos] = code
      }
    } else if (x[0] === '$' && Object.hasOwn(indices, x)) {
      pos = unwrap(res, indices[x], pos, stringMode) // self-reference using shared chunks
    } else if (stringMode) {
      const chars = [...utf16toString(loadBase64(x), 'uint8-le')] // splits by codepoints
      if (chars.length === 0) continue // empty chunk: nothing to write, running code unchanged
      for (const char of chars) {
        res[pos++] = char.length === 1 ? char.charCodeAt(0) : char // strings only for high codepoints
      }

      code = chars[chars.length - 1].codePointAt(0) + 1
    } else {
      const u16 = to16input(loadBase64(x), true) // data is little-endian
      if (u16.length === 0) continue // empty chunk: nothing to write, running code unchanged
      res.set(u16, pos)
      pos += u16.length
      code = u16[u16.length - 1] + 1
    }
  }

  return pos
}
|
|
|
|
/**
 * Returns the decoded table for `id`, decoding it on first use and caching the result.
 *
 * - ids ending in `-ranges` decode to an array of `[a, b]` pairs, where both members
 *   are running sums of the stored deltas
 * - `big5` decodes to an Array whose entries are numbers, strings or unset slots
 * - every other id decodes to a Uint16Array prefilled with 0xFFFD
 *
 * The compressed source entry is released after decoding, so a table can only be
 * decoded once per loaded data set; later calls are served from the cache.
 *
 * @param {string} id - table id
 * @returns {Array|Uint16Array} the decoded table
 * @throws {Error} 'Unknown encoding' when the id has no data or no known size
 * @throws {Error} when the compressed entry was already consumed but nothing is cached
 */
export function getTable(id) {
  const cached = tables.get(id)
  if (cached) return cached

  if (!indices) indices = loadEncodings() // lazy-load
  if (!Object.hasOwn(indices, id)) throw new Error('Unknown encoding')
  if (!indices[id]) throw new Error('Table already used (likely incorrect bundler dedupe)')

  let res
  if (id.endsWith('-ranges')) {
    const idx = indices[id]
    res = []
    let a = 0
    let b = 0
    // Indexed walk instead of repeated shift(): same pairs, no re-indexing of the source array
    for (let i = 0; i < idx.length; i += 2) res.push([(a += idx[i]), (b += idx[i + 1])])
  } else {
    if (!Object.hasOwn(sizes, id)) throw new Error('Unknown encoding')
    if (id === 'big5') {
      res = new Array(sizes[id]) // array of strings or undefined
      unwrap(res, indices[id], 0, true)
      // Pointer code updates are embedded into the table:
      // these four pointers map to a base letter followed by a combining mark
      res[1133] = '\xCA\u0304'
      res[1135] = '\xCA\u030C'
      res[1164] = '\xEA\u0304'
      res[1166] = '\xEA\u030C'
    } else {
      res = new Uint16Array(sizes[id])
      res.fill(0xff_fd)
      unwrap(res, indices[id], 0, false)
    }
  }

  indices[id] = null // gc
  tables.set(id, res)
  return res
}
|