
re-implement Base Unicode features using utf8proc_jll #381


Open · wants to merge 11 commits into base: main
5 changes: 3 additions & 2 deletions Project.toml
@@ -3,11 +3,12 @@ uuid = "70703baa-626e-46a2-a12c-08ffd08c73b4"
authors = ["Claire Foster <[email protected]> and contributors"]
version = "0.4.6"

[deps]
UnicodeNext = "7b9d9d2f-29eb-4111-b31d-f1cfc33d1412"

[compat]
julia = "1.0"

[deps]

[extras]
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2 changes: 2 additions & 0 deletions src/JuliaSyntax.jl
@@ -1,5 +1,7 @@
module JuliaSyntax

using UnicodeNext

# Conservative list of exports - only export the most common/useful things
# here.

54 changes: 16 additions & 38 deletions src/literal_parsing.jl
@@ -329,48 +329,27 @@ function unescape_julia_string(io::IO, txtbuf::Vector{UInt8},
end

#-------------------------------------------------------------------------------
# Unicode normalization. As of Julia 1.8, this is part of Base and the Unicode
# stdlib under the name `Unicode.julia_chartransform`. See
# https://github.com/JuliaLang/julia/pull/42561
#
# To allow use on older Julia versions and to workaround the bug
# https://github.com/JuliaLang/julia/issues/45716
# we reproduce a specialized version of that logic here.
# Unicode normalization.

# static wrapper around user callback function
function utf8proc_custom_func(codepoint::UInt32, ::Ptr{Cvoid})::UInt32
(codepoint == 0x025B ? 0x03B5 : # 'ɛ' => 'ε'
codepoint == 0x00B5 ? 0x03BC : # 'µ' => 'μ'
codepoint == 0x00B7 ? 0x22C5 : # '·' => '⋅'
codepoint == 0x0387 ? 0x22C5 : # '·' => '⋅'
codepoint == 0x2212 ? 0x002D : # '−' (\minus) => '-'
codepoint == 0x210F ? 0x0127 : # 'ℏ' (\hslash) => 'ħ' \hbar
codepoint)
end

function utf8proc_decompose(str, options, buffer, nwords)
ret = ccall(:utf8proc_decompose_custom, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ptr{Cvoid}),
str, sizeof(str), buffer, nwords, options,
@cfunction(utf8proc_custom_func, UInt32, (UInt32, Ptr{Cvoid})), C_NULL)
ret < 0 && Base.Unicode.utf8proc_error(ret)
return ret
end

function utf8proc_map(str::Union{String,SubString{String}}, options::Integer)
nwords = utf8proc_decompose(str, options, C_NULL, 0)
buffer = Base.StringVector(nwords*4)
nwords = utf8proc_decompose(str, options, buffer, nwords)
nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
nbytes < 0 && Base.Unicode.utf8proc_error(nbytes)
return String(resize!(buffer, nbytes))
function normalize_identifier(c::Char)
if c <= '~'
return c # ASCII common case
end
return c == '\u025B' ? '\u03B5' : # 'ɛ' => 'ε'
c == '\u00B5' ? '\u03BC' : # 'µ' => 'μ'
c == '\u00B7' ? '\u22C5' : # '·' => '⋅'
c == '\u0387' ? '\u22C5' : # '·' => '⋅'
c == '\u2212' ? '\u002D' : # '−' (\minus) => '-'
c == '\u210F' ? '\u0127' : # 'ℏ' (\hslash) => 'ħ' \hbar
c
end

function normalize_identifier(str)
flags = Base.Unicode.UTF8PROC_STABLE | Base.Unicode.UTF8PROC_COMPOSE
return isascii(str) ? str : utf8proc_map(str, flags)
function normalize_identifier(str::AbstractString)
isascii(str) ? str :
UnicodeNext.normalize(str, stable=true, compose=true,
chartransform=normalize_identifier)
end
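# Illustrative usage (a sketch, not part of this diff), assuming the two
# methods above: the Char method folds a handful of confusable codepoints,
# and the string method additionally NFC-composes non-ASCII input:
#
#   JuliaSyntax.normalize_identifier("abc")     # "abc"  (ASCII fast path, returned as-is)
#   JuliaSyntax.normalize_identifier("µ")       # "μ"    (U+00B5 MICRO SIGN => U+03BC)
#   JuliaSyntax.normalize_identifier('\u2212')  # '-'    ('−' \minus => ASCII hyphen-minus)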


#-------------------------------------------------------------------------------
function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange)
k = kind(head)
@@ -451,4 +430,3 @@ function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange)
ErrorVal()
end
end

2 changes: 1 addition & 1 deletion src/source_files.jl
@@ -145,7 +145,7 @@ function _print_marker_line(io, prefix_str, str, underline, singleline, color,
# Getting exactly the same width of whitespace as `str` is tricky.
# Especially for mixtures of tabs and spaces.
# tabs are zero width according to textwidth
indent = join(isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str)
indent = join(UnicodeNext.isspace(c) ? c : repeat(' ', textwidth(c)) for c in prefix_str)

# Assume tabs are 4 wide rather than 0. (fixme: implement tab alignment?)
w = textwidth(str) + 4*count(c->c=='\t', str)
163 changes: 133 additions & 30 deletions src/tokenize.jl
@@ -2,27 +2,128 @@ module Tokenize

export tokenize, untokenize

using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str, @u8_str

import ..JuliaSyntax: kind,
is_literal, is_error, is_contextual_keyword, is_word_operator

#-------------------------------------------------------------------------------
# Character-based predicates for tokenization
import Base.Unicode

const EOF_CHAR = typemax(Char)

function is_identifier_char(c::Char)
c == EOF_CHAR && return false
isvalid(c) || return false
return Base.is_id_char(c)
# Julia identifier parsing predicates

using UnicodeNext

import UnicodeNext: CATEGORY_CS, CATEGORY_LL, CATEGORY_LM, CATEGORY_LO,
CATEGORY_LT, CATEGORY_LU, CATEGORY_MC, CATEGORY_ME, CATEGORY_MN,
CATEGORY_ND, CATEGORY_NL, CATEGORY_NO, CATEGORY_PC, CATEGORY_PD,
CATEGORY_PO, CATEGORY_SC, CATEGORY_SK, CATEGORY_SO, CATEGORY_ZS

# port of is_wc_cat_id_start from julia/src/flisp/julia_extensions.c
function _is_identifier_start_char(c::UInt32, cat::Integer)
return (cat == CATEGORY_LU || cat == CATEGORY_LL ||
cat == CATEGORY_LT || cat == CATEGORY_LM ||
cat == CATEGORY_LO || cat == CATEGORY_NL ||
cat == CATEGORY_SC || # allow currency symbols
# other symbols, but not arrows or replacement characters
(cat == CATEGORY_SO && !(c >= 0x2190 && c <= 0x21FF) &&
c != 0xfffc && c != 0xfffd &&
c != 0x233f && # notslash
c != 0x00a6) || # broken bar

# math symbol (category Sm) whitelist
(c >= 0x2140 && c <= 0x2a1c &&
((c >= 0x2140 && c <= 0x2144) || # ⅀, ⅁, ⅂, ⅃, ⅄
c == 0x223f || c == 0x22be || c == 0x22bf || # ∿, ⊾, ⊿
c == 0x22a4 || c == 0x22a5 || # ⊤ ⊥

(c >= 0x2200 && c <= 0x2233 &&
(c == 0x2202 || c == 0x2205 || c == 0x2206 || # ∂, ∅, ∆
c == 0x2207 || c == 0x220e || c == 0x220f || # ∇, ∎, ∏
c == 0x2200 || c == 0x2203 || c == 0x2204 || # ∀, ∃, ∄
c == 0x2210 || c == 0x2211 || # ∐, ∑
c == 0x221e || c == 0x221f || # ∞, ∟
c >= 0x222b)) || # ∫, ∬, ∭, ∮, ∯, ∰, ∱, ∲, ∳

(c >= 0x22c0 && c <= 0x22c3) || # N-ary big ops: ⋀, ⋁, ⋂, ⋃
(c >= 0x25F8 && c <= 0x25ff) || # ◸, ◹, ◺, ◻, ◼, ◽, ◾, ◿

(c >= 0x266f &&
(c == 0x266f || c == 0x27d8 || c == 0x27d9 || # ♯, ⟘, ⟙
(c >= 0x27c0 && c <= 0x27c1) || # ⟀, ⟁
(c >= 0x29b0 && c <= 0x29b4) || # ⦰, ⦱, ⦲, ⦳, ⦴
(c >= 0x2a00 && c <= 0x2a06) || # ⨀, ⨁, ⨂, ⨃, ⨄, ⨅, ⨆
(c >= 0x2a09 && c <= 0x2a16) || # ⨉, ⨊, ⨋, ⨌, ⨍, ⨎, ⨏, ⨐, ⨑, ⨒, ⨓, ⨔, ⨕, ⨖
c == 0x2a1b || c == 0x2a1c)))) || # ⨛, ⨜

(c >= 0x1d6c1 && # variants of \nabla and \partial
(c == 0x1d6c1 || c == 0x1d6db ||
c == 0x1d6fb || c == 0x1d715 ||
c == 0x1d735 || c == 0x1d74f ||
c == 0x1d76f || c == 0x1d789 ||
c == 0x1d7a9 || c == 0x1d7c3)) ||

# super- and subscript +-=()
(c >= 0x207a && c <= 0x207e) ||
(c >= 0x208a && c <= 0x208e) ||

# angle symbols
(c >= 0x2220 && c <= 0x2222) || # ∠, ∡, ∢
(c >= 0x299b && c <= 0x29af) || # ⦛, ⦜, ⦝, ⦞, ⦟, ⦠, ⦡, ⦢, ⦣, ⦤, ⦥, ⦦, ⦧, ⦨, ⦩, ⦪, ⦫, ⦬, ⦭, ⦮, ⦯

# Other_ID_Start
c == 0x2118 || c == 0x212E || # ℘, ℮
(c >= 0x309B && c <= 0x309C) || # katakana-hiragana sound marks

# bold-digits and double-struck digits
(c >= 0x1D7CE && c <= 0x1D7E1)) # 𝟎 through 𝟗 (inclusive), 𝟘 through 𝟡 (inclusive)
end

# utility function to return the ASCII byte if isascii(c),
# and otherwise (for non-ASCII or invalid chars) return 0xff,
# based on the isascii source code.
@inline function _ascii_byte(c::Char)
x = bswap(reinterpret(UInt32, c))
return x < 0x80 ? x % UInt8 : 0xff
end
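# Sketch of the layout this relies on (illustrative, not part of this diff):
# a Julia Char stores its UTF-8 bytes left-aligned in a UInt32, so bswap
# moves the leading byte into the low position:
#
#   _ascii_byte('A')  # reinterpret gives 0x41000000, bswap 0x00000041 => 0x41
#   _ascii_byte('λ')  # reinterpret gives 0xcebb0000, bswap 0x0000bbce => 0xff (non-ASCII)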

# from jl_id_start_char in julia/src/flisp/julia_extensions.c
function is_identifier_start_char(c::Char)
c == EOF_CHAR && return false
isvalid(c) || return false
return Base.is_id_start_char(c)
a = _ascii_byte(c)
if a != 0xff # ascii fast path
return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") || a == u8"_"
end
if c < Char(0xA1) || !isvalid(c)
return false
end
x = UInt32(c)
return _is_identifier_start_char(x, UnicodeNext.category_code(x))
end

# from jl_id_char in julia/src/flisp/julia_extensions.c
function is_identifier_char(c::Char)
a = _ascii_byte(c)
if a != 0xff # ascii fast path
return (a >= u8"A" && a <= u8"Z") || (a >= u8"a" && a <= u8"z") ||
a == u8"_" || (a >= u8"0" && a <= u8"9") || a == u8"!"
end
if c < Char(0xA1) || !isvalid(c)
return false
end
x = UInt32(c)
cat = UnicodeNext.category_code(x)
_is_identifier_start_char(x, cat) && return true
if (cat == CATEGORY_MN || cat == CATEGORY_MC ||
cat == CATEGORY_ND || cat == CATEGORY_PC ||
cat == CATEGORY_SK || cat == CATEGORY_ME ||
cat == CATEGORY_NO ||
# primes (single, double, triple, their reverses, and quadruple)
(x >= 0x2032 && x <= 0x2037) || (x == 0x2057))
return true
end
return false
end
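# Expected behaviour of the two predicates above (illustrative sketch, not
# part of this diff; mirrors the flisp logic being ported):
#
#   is_identifier_start_char('x')  # true  (ASCII letter)
#   is_identifier_start_char('1')  # false (a digit cannot start an identifier)
#   is_identifier_start_char('∇')  # true  (U+2207, whitelisted math symbol)
#   is_identifier_char('1')        # true  (digits may appear after the first char)
#   is_identifier_char('!')        # true  (the lexer separately rejects a trailing "!=")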

function is_invisible_char(c::Char)
@@ -44,15 +44,15 @@ end
# Chars that we will never allow to be part of a valid non-operator identifier
function is_never_id_char(ch::Char)
isvalid(ch) || return true
cat = Unicode.category_code(ch)
cat = UnicodeNext.category_code(ch)
c = UInt32(ch)
return (
# spaces and control characters:
(cat >= Unicode.UTF8PROC_CATEGORY_ZS && cat <= Unicode.UTF8PROC_CATEGORY_CS) ||
(cat >= CATEGORY_ZS && cat <= CATEGORY_CS) ||

# ASCII and Latin1 non-connector punctuation
(c < 0xff &&
cat >= Unicode.UTF8PROC_CATEGORY_PD && cat <= Unicode.UTF8PROC_CATEGORY_PO) ||
cat >= CATEGORY_PD && cat <= CATEGORY_PO) ||

c == UInt32('`') ||

@@ -137,10 +137,10 @@ end
if (u < 0xa1 || u > 0x10ffff)
return false
end
cat = Base.Unicode.category_code(u)
if (cat == Base.Unicode.UTF8PROC_CATEGORY_MN ||
cat == Base.Unicode.UTF8PROC_CATEGORY_MC ||
cat == Base.Unicode.UTF8PROC_CATEGORY_ME)
cat = UnicodeNext.category_code(u)
if (cat == CATEGORY_MN ||
cat == CATEGORY_MC ||
cat == CATEGORY_ME)
return true
end
# Additional allowed cases
@@ -226,7 +226,7 @@ end
@inline ishex(c::Char) = isdigit(c) || ('a' <= c <= 'f') || ('A' <= c <= 'F')
@inline isbinary(c::Char) = c == '0' || c == '1'
@inline isoctal(c::Char) = '0' ≤ c ≤ '7'
@inline iswhitespace(c::Char) = (isvalid(c) && Base.isspace(c)) || c === '\ufeff'
@inline iswhitespace(c::Char) = (isvalid(c) && UnicodeNext.isspace(c)) || c === '\ufeff'

struct StringState
triplestr::Bool
@@ -1289,25 +1390,27 @@ function lex_identifier(l::Lexer, c)
h = simple_hash(c, UInt64(0))
n = 1
ascii = isascii(c)
graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
graphemestate_peek = Ref(zero(Int32))
graphemestate = UnicodeNext.GraphemeState(c)
while true
pc, ppc = dpeekchar(l)
ascii = ascii && isascii(pc)
pc_byte = _ascii_byte(pc)
ascii = ascii && pc_byte != 0xff
if ascii # fast path
pc_byte = pc % UInt8
@inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
break
end
elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
break
end
elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
# ZWJ/ZWNJ only within grapheme sequences, not at end
graphemestate_peek[] = graphemestate[]
if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
break
else
graphemestate, isbreak = UnicodeNext.isgraphemebreak(graphemestate, pc)
if isbreak
if ((pc == '!' && ppc == '=') || !is_identifier_char(pc))
break
end
elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
# ZWJ/ZWNJ only within grapheme sequences, not at end
_, isbreak_peek = UnicodeNext.isgraphemebreak(graphemestate, ppc)
if isbreak_peek
break
end
end
end
c = readchar(l)
3 changes: 1 addition & 2 deletions test/tokenize.jl
@@ -44,8 +44,7 @@ end
end # testset

@testset "tokenize unicode" begin
# FIXME: rm VERSION check once we implement our own is_identifier_char
emoji = VERSION < v"1.5" ? "😄" : "\U1F3F3\UFE0F\U200D\U1F308" # 🏳️‍🌈 requires newer Unicode
emoji = "\U1F3F3\UFE0F\U200D\U1F308" # == "🏳️‍🌈"
str = "𝘋 =2"*emoji
for s in [str, IOBuffer(str)]
l = tokenize(s)