Compare commits

...

4 Commits

Author SHA1 Message Date
qntm 2d7354aa6f
Travis YML (#25) 2020-08-14 20:30:22 +01:00
qntm ce00e429cd Fix gen ref 2020-08-14 18:28:02 +01:00
qntm 5c25609a63 Non-local paths 2020-08-14 18:25:56 +01:00
qntm 5dd407fd67 Roll in generator script 2020-08-14 18:25:32 +01:00
6 changed files with 139 additions and 2 deletions

5
.travis.yml Normal file
View File

@ -0,0 +1,5 @@
# Travis CI configuration: treat the project as a Node.js project and build it
# against each of the Node.js major versions listed under `node_js`.
language: node_js
node_js:
- 10
- 12
- 14

View File

@ -224,7 +224,7 @@ Not yet.
To encode one additional bit per code point, we need to *double* the number of code points we use from 65,536 to 131,072. This would be a new encoding, [Base131072](https://github.com/qntm/base131072), and its UTF-32 encoding efficiency would be 53% vs. 50% for Base65536. (Note that in UTF-16, [Base32768](https://github.com/qntm/base32768) significantly outperforms either choice, and in UTF-8, Base64 remains the preferred choice.)
However, as of Unicode 10.0, [`base65536gen`](https://github.com/qntm/base65536gen) returns only 116,813 safe code points altogether. Perhaps future versions of Unicode will eventually assign more characters and make this possible, but even when this eventually happens, it seems unlikely that the characters will be arranged neatly in the blocks of 256 which make Base65536 so small and simple. It might not be worth the trouble...
However, as of Unicode 10.0, [`safe-code-point`](https://github.com/qntm/safe-code-point) returns only 116,813 safe code points altogether. Perhaps future versions of Unicode will assign enough additional characters to make this possible, but even if that happens, it seems unlikely that the characters will be arranged neatly in the blocks of 256 which make Base65536 so small and simple. It might not be worth the trouble...
## License

8
package-lock.json generated
View File

@ -1,6 +1,6 @@
{
"name": "base65536",
"version": "3.0.2",
"version": "3.0.3",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
@ -6774,6 +6774,12 @@
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==",
"dev": true
},
"safe-code-point": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/safe-code-point/-/safe-code-point-1.0.0.tgz",
"integrity": "sha512-kSSM+6Ks9RD0fPN+/WvArS3dGVBBpfvVQyyOmKZ91/ZZ8DykxJ9JrMFsR9YJgwKE1mHdgybmd1hyxuf/TWRWHg==",
"dev": true
},
"safe-regex": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz",

View File

@ -33,6 +33,7 @@
"glob": "^7.1.6",
"jest": "^25.1.0",
"rollup": "^1.31.0",
"safe-code-point": "^1.0.0",
"standard": "^14.3.3"
},
"babel": {

74
scripts/gen.js Normal file
View File

@ -0,0 +1,74 @@
// This module's purpose is to generate 65,792 "safe" <https://qntm.org/safe>
// Unicode code points suitable for use in the Base65536 encoding. It makes use
// of the sibling package `safe-code-point`
// <https://github.com/qntm/safe-code-point>.
// 65,792 is 2**16 + 2**8. Each code point in the initial collection of 2**16
// code points encodes 16 bits, i.e., a distinct possible pair of bytes. The
// extra 2**8 code points are needed to encode 8 bits of the single final byte
// in cases where the binary data runs for an odd number of bytes.
// Rather than generate thousands and thousands of arbitrary safe code points,
// this module simplifies matters by finding 256 + 1 contiguous, aligned blocks
// of 256 safe code points. This means the last 8 bits of the code point can be
// decoded directly to an encoded byte, and the lookup table to decode the other
// byte (if any) is relatively small.
// This program was run only once, with the successful results immediately
// transplanted into `base65536` for use. It is kept here for historical reasons
// and to ensure reproducibility.
import safeCodePoint, { generalCategory } from 'safe-code-point'
// Reports whether every code point in the half-open interval [min, max) is
// acceptable for use in the encoding. An empty interval is trivially accepted.
const safeRange = (min, max) => {
  // Only "Letter, other" (Lo) code points which are also safe are accepted,
  // both judged against Unicode 8.0, which was current when the blocks were
  // originally chosen. The reason for restricting the choice to Lo is no
  // longer recalled.
  const isAcceptable = candidate =>
    generalCategory(candidate, '8.0') === 'Lo' &&
    safeCodePoint(candidate, '8.0')

  let candidate = min
  while (candidate < max) {
    // A single unacceptable code point disqualifies the whole interval.
    if (!isAcceptable(candidate)) {
      return false
    }
    candidate++
  }
  return true
}
// Walks the Unicode code space from 0 in steps of `rangeSize` and collects the
// first code point of every step-aligned range of `rangeSize` code points
// which `safeRange` accepts. Returns those starting code points in ascending
// order.
const getAllSafeRanges = rangeSize => {
  // Exclusive upper bound of the scan: 2**16 + 2**20 code points.
  const scanLimit = (1 << 16) + (1 << 20)

  const rangeStarts = []
  let rangeStart = 0
  while (rangeStart < scanLimit) {
    const rangeEnd = rangeStart + rangeSize
    if (safeRange(rangeStart, rangeEnd)) {
      rangeStarts.push(rangeStart)
    }
    rangeStart = rangeEnd
  }
  return rangeStarts
}
// Find every aligned block of 2**8 safe code points. The first block found is
// set aside as the padding block; the blocks after it are the data blocks.
const [paddingBlockCodePoint, ...dataBlockCodePoints] = getAllSafeRanges(1 << 8)

// First character of the padding block, as a one-character string.
export const paddingBlockStart = String.fromCodePoint(paddingBlockCodePoint)

// First characters of the first 2**8 data blocks, concatenated in ascending
// code point order into a single string.
export const blockStarts = String.fromCodePoint(
  ...dataBlockCodePoints.slice(0, 1 << 8)
)
// There are now implementations of
// Base65536 in numerous programming languages beyond the original JavaScript,
// and I consider it *extremely* undesirable to introduce multiple incompatible
// versions of the Base65536 encoding, so this program is unlikely to ever be
// run again unless a very serious problem is discovered.
// Still, if I tried this again, here are some things I might do differently:
// * Include other safe Unicode General Categories, so that Base65536 output
// does not seemingly consist only of CJK characters. At present, to my eye (not
// being a reader of those languages) it resembles ordinary CJK text. I would
// prefer Base65536 to be more obviously a kludge of diverse scripts, so that it
// can't be mistaken for text, even by someone who knows none of those scripts.
// Naturally, the lion's share of code points will have to remain CJK, since
// that is where most of the safe code points are.
// * Try harder to select lower code points, those with 3-byte encodings in
// UTF-8 and 2-byte encodings in UTF-16, to reduce the average size of the
// encoded Base65536 output (even though Base65536 is optimised for UTF-32).
// * Perhaps try to find larger blocks of 512 or 1024 code points rather than
// 256, or other techniques for reducing the size of the lookup tables.
// * Choose only characters whose East_Asian_Width is 'W' (wide), so that all
// of the chosen characters share a single East_Asian_Width.

51
scripts/gen.spec.js Normal file
View File

@ -0,0 +1,51 @@
/* eslint-env jest */
import { eastAsianWidth } from 'safe-code-point'
import { paddingBlockStart, blockStarts } from './gen'
// Pins the output of the generator script: the exact padding block, the exact
// 256 data blocks, and the East_Asian_Width of every code point in all of
// those blocks.
describe('gen', () => {
  it('generates the correct padding block', () => {
    expect(paddingBlockStart).toBe('ᔀ')
  })

  it('generates the correct blocks', () => {
    expect(blockStarts).toBe(
      '㐀㔀㘀㜀㠀㤀㨀㬀㰀㴀㸀㼀䀀䄀䈀䌀' +
      '䐀䔀䘀䜀䠀䤀䨀䬀䰀一伀倀儀刀匀吀' +
      '唀嘀圀堀夀娀嬀尀崀帀开怀愀戀挀搀' +
      '攀昀最栀椀樀欀氀洀渀漀瀀焀爀猀琀' +
      '甀瘀眀砀礀稀笀簀紀縀缀耀脀舀茀萀' +
      '蔀蘀蜀蠀褀言謀谀贀踀輀退鄀鈀錀鐀' +
      '销阀需頀餀騀鬀鰀鴀鸀ꄀꈀꌀꔀ𐘀𒀀' +
      '𒄀𒈀𓀀𓄀𓈀𓌀𔐀𔔀𖠀𖤀𠀀𠄀𠈀𠌀𠐀𠔀' +
      '𠘀𠜀𠠀𠤀𠨀𠬀𠰀𠴀𠸀𠼀𡀀𡄀𡈀𡌀𡐀𡔀' +
      '𡘀𡜀𡠀𡤀𡨀𡬀𡰀𡴀𡸀𡼀𢀀𢄀𢈀𢌀𢐀𢔀' +
      '𢘀𢜀𢠀𢤀𢨀𢬀𢰀𢴀𢸀𢼀𣀀𣄀𣈀𣌀𣐀𣔀' +
      '𣘀𣜀𣠀𣤀𣨀𣬀𣰀𣴀𣸀𣼀𤀀𤄀𤈀𤌀𤐀𤔀' +
      '𤘀𤜀𤠀𤤀𤨀𤬀𤰀𤴀𤸀𤼀𥀀𥄀𥈀𥌀𥐀𥔀' +
      '𥘀𥜀𥠀𥤀𥨀𥬀𥰀𥴀𥸀𥼀𦀀𦄀𦈀𦌀𦐀𦔀' +
      '𦘀𦜀𦠀𦤀𦨀𦬀𦰀𦴀𦸀𦼀𧀀𧄀𧈀𧌀𧐀𧔀' +
      '𧘀𧜀𧠀𧤀𧨀𧬀𧰀𧴀𧸀𧼀𨀀𨄀𨈀𨌀𨐀𨔀'
    )
  })

  it('has the right East_Asian_Width properties', () => {
    // All 256 characters in each block have the same East_Asian_Width property.
    // 243 of the data blocks are 'W' (wide); the other 13 data blocks, plus the
    // padding block, are 'N' (neutral, which in effect is narrow). This is
    // significant when considering rendering and wrapping.

    // The padding block is included here so that the "+ 1" neutral block is
    // actually verified, not just listed among the neutral block starts.
    const allBlockStarts = [paddingBlockStart, ...blockStarts].map(x => x.codePointAt(0))
    const neutralBlockStarts = new Set(
      [...'ᔀꔀ𐘀𒀀𒄀𒈀𓀀𓄀𓈀𓌀𔐀𔔀𖠀𖤀'].map(x => x.codePointAt(0))
    )

    allBlockStarts.forEach(blockStart => {
      // Every block start, neutral or not, is a multiple of 2**8 and every
      // block is exactly 2**8 long, so a code point lies inside a neutral block
      // precisely when its own block start is one of the neutral block starts.
      // The expected width can therefore be decided once per block.
      const expectedWidth = neutralBlockStarts.has(blockStart) ? 'N' : 'W'

      for (let i = 0; i < 1 << 8; i++) {
        expect(eastAsianWidth(blockStart + i, '8.0')).toBe(expectedWidth)
      }
    })
  })
})