Module:Gilatoft/Compiler/Tokenizer

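-- Tokenizer for the Gilatoft constructed language. Reads source text and
-- produces a tree of sentences, clauses and word tokens, classifying each
-- word as a pronoun, proper noun, common noun or verb and decoding its
-- aspect, case and voice marks.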

local byte = string.byte
local char = string.char
local sub = string.sub
local gsub = string.gsub
local lower = string.lower
local unpack = table.unpack
local concat = table.concat

local tokenizer = {}

local BYTE_COMMA        = byte(",")
local BYTE_FULL_STOP    = byte(".")
local BYTE_LIST_START   = byte("[")
local BYTE_LIST_END     = byte("]")
local BYTE_STRING_START = byte("\"")
local BYTE_STRING_END   = byte("\"")

local RESERVED_CHARACTERS = {
    [BYTE_COMMA]        = true,
    [BYTE_FULL_STOP]    = true,
    [BYTE_LIST_START]   = true,
    [BYTE_LIST_END]     = true,
    [BYTE_STRING_START] = true,
    [BYTE_STRING_END]   = true,
    [0]                 = true
}

local BYTE_ESCAPE = byte("\\")

local RAW_ESCAPE_TABLE = {
    a = "\a", b = "\b", f = "\f", n = "\n",
    r = "\r", t = "\t", v = "\v",
    ["\\"] = "\\", ["0"] = "\0", [" "] = " ",
    ["\""] = "\"",
    ["["] = "[", ["]"] = "]"
}

local RAW_UNESCAPE_TABLE = {}
for k, v in pairs(RAW_ESCAPE_TABLE) do
    RAW_UNESCAPE_TABLE[v] = "\\"..k
end

local ESCAPE_TABLE = {}
for k, v in pairs(RAW_ESCAPE_TABLE) do
    ESCAPE_TABLE[byte(k)] = byte(v)
end

local BYTE_SPACE = byte(" ")
local BYTE_TAB   = byte("\t")
local BYTE_LF    = byte("\n")

local WHITE_CHARACTERS = {
    [BYTE_SPACE] = true,
    [BYTE_TAB]   = true,
    [BYTE_LF]    = true
}

local BYTE_A = byte("a")
local BYTE_O = byte("o")
local BYTE_U = byte("u")
local BYTE_E = byte("e")
local BYTE_I = byte("i")

local VOWELS = {
    [BYTE_A] = true,
    [BYTE_O] = true,
    [BYTE_U] = true,
    [BYTE_E] = true,
    [BYTE_I] = true
}

local BYTE_ACCENT_MARK = byte("'")
local BYTE_PROPER_NOUN_MARK = byte("'")   -- same character as the accent mark

local PRONOUNS = {
    -- personal pronouns

    ["va"]  = {"1s", "nominative"},
    ["iva"] = {"1p", "nominative"},
    ["na"]  = {"2s", "nominative"},
    ["ina"] = {"2p", "nominative"},
    ["la"]  = {"3s", "nominative"},
    ["ila"] = {"3p", "nominative"},

    -- demonstrative pronouns

    ["ta"]  = {"this", "nominative"},
    ["ita"] = {"these", "nominative"},
    ["pa"]  = {"that", "nominative"},
    ["ipa"] = {"those", "nominative"},

    -- declarative pronouns

    ["sa"]  = {"any", "nominative"},
    ["ga"]  = {"all", "nominative"},
    ["ma"]  = {"each", "nominative"},

    -- assertive pronouns

    ["asa"]  = {"assert any", "nominative"},
    ["aga"]  = {"assert all", "nominative"},
    ["ama"]  = {"assert each", "nominative"},

    -- interrogative pronouns

    ["esa"]  = {"query any", "nominative"},
    ["ega"]  = {"query all", "nominative"},
    ["ema"]  = {"query each", "nominative"}
}

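-- Derive the accusative/dative/genitive/oblique form of each pronoun by
-- replacing the final nominative vowel "a" with "i", "o", "u" or "e".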
local function conjugate_pronouns(pronouns)
    local conjugated = {}
    for pron, desc in pairs(pronouns) do
        local stem = pron:sub(1, #pron - 1)
        conjugated[stem.."i"] = {desc[1], "accusative"}
        conjugated[stem.."o"] = {desc[1], "dative"}
        conjugated[stem.."u"] = {desc[1], "genitive"}
        conjugated[stem.."e"] = {desc[1], "oblique"}
    end

    for art, desc in pairs(conjugated) do
        pronouns[art] = desc
    end
end

conjugate_pronouns(PRONOUNS)

local LEXICAL_ASPECT_MARKS = {
    ["a"]  = {"dynamic", "atelic", "durative"},
    ["ae"] = {"dynamic", "atelic", "punctual"},
    ["o"]  = {"dynamic", "telic", "durative"},
    ["oe"] = {"dynamic", "telic", "punctual"},
    ["e"]  = {"static", "atelic", "durative"},
    ["ei"] = {"static", "atelic", "punctual"},
    ["i"]  = {"static", "telic", "durative"},
    ["ie"] = {"static", "telic", "punctual"}
}

local GRAMMATICAL_ASPECT_MARKS = {
    ["a"]  = "empirical",
    ["i"]  = "initial",
    ["o"]  = "progressive",
    ["u"]  = "perfective",
    ["e"]  = "continuous"
}

local LEXICAL_PREFIXES = {
    ["a"]    = "accomplished",
    ["ju"]   = "defective",
    ["bi"]   = "negative",
    ["ga"]   = "opposite",
    ["si"]   = "analogous",
    ["zu"]   = "posterior",
    ["gi"]   = "transcendental",
    ["cu"]   = "reflexive",
    ["cuta"] = "voluntary",
    ["pa"]   = "mutual",
    ["di"]   = "half",
    ["en"]   = "singular",
    ["mo"]   = "dual",
    ["sta"]  = "trial",
    ["mu"]   = "plural",
    ["na"]   = "repeat",
    ["ho"]   = "common",
    ["o"]    = "greator",
    ["li"]   = "smaller",
    ["so"]   = "orignal",
    ["lu"]   = "convergent",
    ["ca"]   = "separative",
    ["fla"]  = "transfering",
    ["la"]   = "forward",
    ["ce"]   = "backward",
    ["pju"]  = "before",
    ["si"]   = "optional",
    ["se"]   = "parrallel",
    ["me"]   = "condition",
    ["hi"]   = "reason",
    ["va"]   = "result",
    ["su"]   = "purpose",
    ["de"]   = "theme",
    ["vi"]   = "synonym"
}

local GRAMMATICAL_POSTFIXES = {
    ["n"]   = {"predicative", "active", 1},
    ["s"]   = {"predicative", "active", 2},
    ["sai"] = {"predicative", "active", 2, "honorific"},
    [0]     = {"predicative", "active", 3},
    ["vai"] = {"predicative", "active", 3, "honorific"},

    ["ni"] = {"predicative", "passive", 1},
    ["si"] = {"predicative", "passive", 2},
    ["vi"] = {"predicative", "passive", 3},

    ["nu"] = {"predicative", "employment", 1},
    ["su"] = {"predicative", "employment", 2},
    ["vu"] = {"predicative", "employment", 3},

    -- adjunct adverb

    ["f"]  = {"adverbial", "adjunct", "active"},
    ["fi"] = {"adverbial", "adjunct", "passive"},
    ["fu"] = {"adverbial", "adjunct", "employment"},

    -- determinator adverb

    ["t"]  = {"adverbial", "determinator", "active"},
    ["ti"] = {"adverbial", "determinator", "passive"},
    ["tu"] = {"adverbial", "determinator", "employment"}
}

local CASE_MARKS = {
    ["a"]  = "nominative",
    ["i"]  = "accusative",
    ["o"]  = "dative",
    ["u"]  = "genitive",
    ["e"]  = "oblique"
}

local NOUN_POSTFIXES = {
    -- infinitive verb

    ["ns"]  = {"adjective", "active"},
    ["nsi"] = {"adjective", "passive"},
    ["nsu"] = {"adjective", "employment"},

    -- gerund

    ["gn"]  = {"gerund", "active"},
    ["gni"] = {"gerund", "passive"},
    ["gnu"] = {"gerund", "employment"},

    -- semantic role

    ["nt"] = {"role", "agent"},
    ["r"]  = {"role", "patient"},
    ["m"]  = {"role", "experiencer"},
    ["d"]  = {"role", "scene"},
    ["ft"] = {"role", "measure"},
    ["vz"] = {"role", "outcome"},
    ["g"]  = {"role", "depletion"}
}

local POSTFIXES = {}
for k, v in pairs(GRAMMATICAL_POSTFIXES) do POSTFIXES[k] = v end
for k, v in pairs(NOUN_POSTFIXES) do POSTFIXES[k] = v end

local function tokenizer_error(state, message)
    error(("[tokenizer] %s:%s:%s: %s"):format(
        state.location,
        state.line,
        state.column,
        message), 0)
end

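-- Consume one byte from the source, keeping the line/column counters in
-- sync for error reporting.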
local function read_byte(state, source)
    local index = state.index
    local cb = byte(source, index)
    if cb == nil then return nil end

    if cb == BYTE_LF then
        state.line = state.line + 1
        state.column = 1
    else
        state.column = state.column + 1
    end

    state.index = index + 1
    return cb
end

local function try_byte(state, source, cb)
    local sb = byte(source, state.index)
    if sb ~= cb then
        return nil
    else
        return read_byte(state, source)
    end
end

local function close_with_byte(state, source, cb)
    local sb = byte(source, state.index)
    if sb == nil then
        tokenizer_error(state, ("end of source ('%s' expected)")
            :format(char(cb)))
    elseif sb ~= cb then
        return nil
    else
        return read_byte(state, source)
    end
end

local function skip_white(state, source)
    while true do
        local cb = byte(source, state.index)
        if cb == nil or not WHITE_CHARACTERS[cb] then
            return
        end
        read_byte(state, source)
    end
end

-- TODO: numeric escape sequences ("\ddd") are not implemented yet.
local function parse_numeric_escape(state, source)
    tokenizer_error(state, "numeric escape sequences are not supported")
end

local function read_character_byte(state, source)
    local cb = read_byte(state, source)
    if cb == nil then return nil end

    if cb == BYTE_ESCAPE then
        local escape_head = read_byte(state, source)
        if escape_head == nil then
            tokenizer_error(state, "end of source (escape character expected)")
        end
        return ESCAPE_TABLE[escape_head]
            or parse_numeric_escape(state, source)
    end

    return cb
end

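-- Read a double-quoted string literal, decoding backslash escapes via
-- ESCAPE_TABLE. Returns nil if the next byte does not open a string.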
local function read_string(state, source)
    if not try_byte(state, source, BYTE_STRING_START) then
        return nil
    end

    local cs = {}
    while true do
        if close_with_byte(state, source, BYTE_STRING_END) then
            break
        end
        cs[#cs+1] = read_character_byte(state, source)
    end
    return char(unpack(cs))
end

local function read_list(state, source, element_reader)
    if not try_byte(state, source, BYTE_LIST_START) then
        return nil
    end

    local es = {}
    while true do
        if close_with_byte(state, source, BYTE_LIST_END) then
            break
        end
        es[#es+1] = element_reader(state, source)
    end
    return es
end

local function read_letters(state, source)
    local cs = {}

    while true do
        local cb = byte(source, state.index)
        if cb == nil or WHITE_CHARACTERS[cb] or RESERVED_CHARACTERS[cb] then
            break
        end
        cs[#cs+1] = cb
        read_byte(state, source)
    end

    return #cs > 0 and char(unpack(cs))
end

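-- Count consonant clusters in a stem, scanning right to left and stopping
-- at an accent mark. Words with two or fewer clusters are too short to
-- carry a postfix (see get_postfix).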
local function calculate_stem_consonant_cluster_count(str)
    local count = 0
    local i = #str
    local cb

    while i > 0 do
        cb = byte(str, i)
        if cb == BYTE_ACCENT_MARK then
            break
        elseif not VOWELS[cb] then
            count = count + 1
            while true do
                i = i - 1
                if i <= 0 then break end
                cb = byte(str, i)
                if VOWELS[cb] then break end
            end
        end
        i = i - 1
    end

    return count
end

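-- Find the longest known postfix ending `str`, returning it together with
-- its grammatical description, or nil when none matches.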
local function get_postfix(state, str)
    local candidate
    local candidate_desc

    local acc = ""
    local cb
    local desc

    if calculate_stem_consonant_cluster_count(str) <= 2 then
        return nil
    end

    for i = #str, 1, -1 do
        cb = byte(str, i)
        acc = char(cb)..acc

        desc = POSTFIXES[acc]
        if desc then
            candidate = acc
            candidate_desc = desc
        end
    end

    return candidate, candidate_desc
end

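-- Split a stem into an abstract root plus its two vowel marks. The vowel
-- slots of the root are replaced with dots (a stem of shape CVCV yields
-- "C.C."); the left mark holds the lexical aspect vowels, the right mark
-- the grammatical aspect (verbs) or case (nouns) vowels. On failure
-- returns nil followed by 1 (left mark missing) or 2 (right mark missing).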
local function deconstruct_stem(state, stem)
    local root_lhs = ""
    local root_rhs = ""
    local mark_lhs = ""
    local mark_rhs = ""

    local cb
    local i = #stem

    if not VOWELS[byte(stem, i)] then
        tokenizer_error(state, "invalid stem: "..stem)
    end

    while i > 0 do
        cb = byte(stem, i)
        if not VOWELS[cb] then break end
        mark_rhs = char(cb)..mark_rhs
        i = i - 1
    end

    if #mark_rhs == 0 then
        return nil, 2
    end

    while i > 0 do
        cb = byte(stem, i)
        if VOWELS[cb] then break end
        root_rhs = char(cb)..root_rhs
        i = i - 1
    end

    if #root_rhs == 0 then
        tokenizer_error(state, "invalid word root")
    end

    while i > 0 do
        cb = byte(stem, i)
        if not VOWELS[cb] then break end
        mark_lhs = char(cb)..mark_lhs
        i = i - 1
    end

    if #mark_lhs == 0 then
        return nil, 1
    end

    while i > 0 do
        cb = byte(stem, i)
        if VOWELS[cb] then break end
        root_lhs = char(cb)..root_lhs
        i = i - 1
    end

    if #root_lhs == 0 then
        tokenizer_error(state, "invalid word root")
    end

    return sub(stem, 1, i)..root_lhs.."."..root_rhs..".",
        mark_lhs, mark_rhs
end

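-- Read one word token. Returns {metadata, word}; analyses are cached in
-- state.word_cache so repeated words are only deconstructed once.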
local function read_token(state, source)
    local metadata = {
        location = state.location,
        index = state.index,
        line = state.line,
        column = state.column
    }

    local raw = read_letters(state, source)
    if not raw then return nil end

    local word_cache = state.word_cache
    local cache = word_cache[raw]
    if cache then
        return {metadata, cache}
    end

    local w = {}
    local token = {metadata, w}

    if byte(raw, #raw-1) == BYTE_PROPER_NOUN_MARK then
        local case = CASE_MARKS[char(byte(raw, #raw))]
        if not case then
            tokenizer_error(state, "invalid case mark on proper noun: "..raw)
        end
        w.type = "noun"
        w.subtype = "proper"
        w.raw = raw
        w.stem = sub(raw, 1, #raw - 2)
        w.case = case
        word_cache[raw] = w
        return token
    end

    raw = lower(raw)
    w.raw = raw
    word_cache[raw] = w

    local pron_desc = PRONOUNS[raw]
    if pron_desc then
        w.type = "noun"
        w.subtype = "pronoun"
        w.case = pron_desc[2]
        w.detail = pron_desc
        w.root = sub(raw, 1, #raw-1).."."
        return token
    end

    local postfix, desc = get_postfix(state, raw)
    if postfix then
        raw = sub(raw, 1, #raw - #postfix)
        w.stem = raw
        w.subtype = desc[1]
        w.detail = desc

        if NOUN_POSTFIXES[postfix] then
            w.type = "noun"
        else
            w.type = "verb"
        end
    else
        w.type = "verb"
        w.stem = raw

        local desc = GRAMMATICAL_POSTFIXES[0]
        w.subtype = desc[1]
        w.detail = desc
    end

    if w.type == "verb" then
        local root, la, ga = deconstruct_stem(state, raw)
        if root == nil then
            tokenizer_error(state, la == 1
                and "lexical aspect mark required"
                or "grammatical aspect mark required")
        end

        local la_desc = LEXICAL_ASPECT_MARKS[la]
        local ga_desc = GRAMMATICAL_ASPECT_MARKS[ga]

        if not la_desc then
            tokenizer_error(state, "invalid lexical aspect mark: "..la)
        elseif not ga_desc then
            tokenizer_error(state, "invalid grammatical aspect mark: "..ga)
        end

        w.root = root
        w.lexical_aspect = la_desc
        w.grammatical_aspect = ga_desc
    elseif w.type == "noun" then
        local root, la, cs = deconstruct_stem(state, raw)
        if root == nil then
            tokenizer_error(state, la == 1
                and "lexical aspect mark required"
                or "case mark required")
        end

        local la_desc = LEXICAL_ASPECT_MARKS[la]
        local cs_desc = CASE_MARKS[cs]

        if not la_desc then
            tokenizer_error(state, "invalid lexical aspect mark: "..la)
        elseif not cs_desc then
            tokenizer_error(state, "invalid case mark: "..cs)
        end

        w.root = root
        w.lexical_aspect = la_desc
        w.case = cs_desc
    end

    return token
end

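-- Read one sentence: clauses separated by commas, terminated by a full
-- stop. Clause elements are word tokens, quoted strings, or bracketed
-- lists of sub-sentences.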
local function read_sentence(state, source)
    local clause = {}
    local sentence = {
        type = "sentence",
        clause
    }

    while true do
        skip_white(state, source)

        if close_with_byte(state, source, BYTE_FULL_STOP) then
            break
        elseif close_with_byte(state, source, BYTE_COMMA) then
            if #clause ~= 0 then
                clause = {}
                sentence[#sentence+1] = clause
            end
        else
            clause[#clause+1] =
                read_token(state, source)
                or read_string(state, source)
                or read_list(state, source, read_sentence)
                or tokenizer_error(state,
                    "unrecognized character: "..char(byte(source, state.index)))
        end
    end

    if #sentence == 1 and #clause == 0 then
        return nil
    end

    return sentence
end

local function read_text(state, source)
    local text = {
        type = "text"
    }
    
    while true do
        skip_white(state, source)
        if state.index > #source then
            break
        end
        text[#text+1] = read_sentence(state, source)
    end

    return text
end

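-- Tokenize `source`, reporting errors against `location` (defaults to
-- "[source]"). A minimal usage sketch ("va" and "ti" are pronouns from
-- the PRONOUNS table above):
--   local Tokenizer = require("Module:Gilatoft/Compiler/Tokenizer")
--   local text = Tokenizer.read("va ti.", "example")
--   -- text = {type = "text", {type = "sentence", {token, token}}}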
tokenizer.read = function(source, location)
    return read_text({
        index = 1,
        line = 1,
        column = 1,
        location = location or "[source]",
        word_cache = {}
    }, source)
end

return tokenizer