Module:Hrkt-translit

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module will transliterate text in the Kana script. It is used to transliterate Southern Amami-Oshima, Japanese, Hachijō, Kikai, Miyako, Oki-No-Erabu, Northern Amami-Oshima, Yaeyama, Okinawan, Toku-No-Shima, Kunigami, Yonaguni, and Yoron. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:Hrkt-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Local aliases for library functions used throughout this module.
local concat = table.concat
local insert = table.insert
local load_data = mw.loadData
local toNFD = mw.ustring.toNFD
local umatch = mw.ustring.match

-- Kana helpers taken from Module:ja.
local m_ja = require("Module:ja")
local kata_to_hira = m_ja.kata_to_hira
local normalize_kana = m_ja.normalize_kana

-- Contents of "Module:Hrkt-translit/data"; loaded on first use by
-- process_data and get_data.
local data_common
-- Internal placeholder characters. export.tr replaces every occurrence of
-- either one with an apostrophe in its final output.
local glottal = "\1"
local disambig = "\2"
-- Body of a Lua character class covering the ASCII lowercase letters other
-- than a, e, i, o, u, w and y. It is always used wrapped in "[" .. cons .. "]".
local cons = "b-df-hj-np-tvxz"

-- Table of functions returned by this module.
local export = {}

--- Returns the "initial" of a string: the string with the trailing run of its
-- final character removed (e.g. "ka" -> "k", "shi" -> "sh", "kyaa" -> "ky").
-- When removing that run would leave nothing (the string is empty, or is made
-- up solely of repetitions of one character) the string is returned unchanged.
--
-- The work is done on raw UTF-8 bytes instead of building a pattern from the
-- final character, so an empty string or a final "%" / "^" does not raise a
-- pattern error.
-- @param text string (UTF-8)
-- @return string
local function get_initial(text)
	-- Final UTF-8 character: a non-continuation byte and its continuation bytes.
	local last = text:match("[^\128-\191][\128-\191]*$")
	if not last then
		return text
	end
	local width, stop = #last, #text
	-- Walk backwards over every repetition of the final character. A match can
	-- only occur on a character boundary because `last` starts with a lead byte.
	while stop >= width and text:sub(stop - width + 1, stop) == last do
		stop = stop - width
	end
	if stop == 0 then
		-- Nothing precedes the final run; keep the whole string.
		return text
	end
	return text:sub(1, stop)
end

-- Collects consonant-only initials from a romanisation table.
-- For every entry of `data` whose key is not yet in `checked` and whose value
-- consists only of letters, the value's initial (see get_initial) is looked up
-- in d_voicing and d_semivoicing; the initial and both lookup results are each
-- recorded as keys of `initials` when they consist solely of characters from
-- `cons`. Every visited key is then recorded in `checked`.
-- Does nothing when `data` is nil or false.
local function handle_initials(data, d_voicing, d_semivoicing, initials, checked)
	if not data then
		return
	end
	local only_cons = "^[" .. cons .. "]+$"
	-- Records `candidate` when it is present and made up purely of consonants.
	local function record(candidate)
		if candidate and candidate:match(only_cons) then
			initials[candidate] = true
		end
	end
	for kana, roman in pairs(data) do
		if not checked[kana] and umatch(roman, "^%a+$") then
			local base = get_initial(roman)
			record(base)
			local voiced, semivoiced = d_voicing[base], d_semivoicing[base]
			record(voiced)
			record(semivoiced)
		end
		checked[kana] = true
	end
end

-- Computes the set of consonant initials for a data table and stores it in
-- data.initials, then returns the same table.
-- When `common` is false or nil, the common data module is loaded, its
-- tr_voicing / tr_semivoicing tables are used wherever `data` lacks its own,
-- and its `rom` table is scanned after data.rom.
-- When `common` is truthy, only `data` itself is consulted.
function export.process_data(data, common)
	local found, seen = {}, {}
	local voicing, semivoicing = data.tr_voicing, data.tr_semivoicing
	data.initials = found
	if common then
		handle_initials(data.rom, voicing, semivoicing, found, seen)
		return data
	end
	data_common = data_common or load_data("Module:Hrkt-translit/data")
	voicing = voicing or data_common.tr_voicing
	semivoicing = semivoicing or data_common.tr_semivoicing
	handle_initials(data.rom, voicing, semivoicing, found, seen)
	handle_initials(data_common.rom, voicing, semivoicing, found, seen)
	return data
end

-- Builds the lookup function that export.tr uses to read transliteration data.
-- The returned function takes a chain of keys and walks nested tables with
-- them, one key per level.
-- If `lang` is given and package.loaders[2] returns a truthy value for
-- "Module:Hrkt-translit/data/<lang>", lookups consult that language-specific
-- data first and fall back to the common data; otherwise only the common data
-- is consulted.
local function get_data(lang)
	data_common = data_common or load_data("Module:Hrkt-translit/data")
	-- Follows the given keys through nested tables starting at t.
	-- Returns nil as soon as a non-table is reached while keys remain.
	local function inspect_table(t, ...)
		for i = 1, select("#", ...) do
			if type(t) == "table" then
				-- select(i, ...) yields several values; indexing uses only the first.
				t = t[select(i, ...)]
			else return nil end
		end
		return t
	end
	if lang then
		local name_data = "Module:Hrkt-translit/data/" .. lang
		-- NOTE(review): a truthy loader result is treated as "the data module
		-- exists" -- confirm against the Scribunto package loader.
		if package.loaders[2](name_data) then
			local data_lang = load_data(name_data)
			return function(...)
				-- Walk both data sets in parallel, starting from the first key.
				local item_lang, item_common = data_lang[...], data_common[...]
				for i = 2, select("#", ...) do 
					local key = select(i, ...)
					if type(item_lang) == "table" then
						item_lang = item_lang[key]
					-- Language data ends here: finish the walk in the common data only.
					else return inspect_table(item_common, select(i, ...)) end
					if type(item_common) == "table" then
						item_common = item_common[key]
					-- Common data ends here: item_lang already consumed `key`, so
					-- continue in the language data from the next key.
					else return inspect_table(item_lang, select(i + 1, ...)) end
				end
				-- Prefer the language-specific value whenever it is not nil.
				if item_lang ~= nil then return item_lang else return item_common end
			end
		end
	end
	-- No language-specific data: look up in the common data only.
	return function(...)
		return inspect_table(data_common[...], select(2, ...))
	end
end

-- Applies a voicing table to the piece result[i_last].
-- The piece's initial (see get_initial) is looked up in the table returned by
-- d(key) and, when an entry exists, replaced by it at the start of the piece.
-- Unless `hist` is set, a piece marked "historical w" in result_sp gets a "w"
-- prefixed before the lookup.
-- Returns the new piece text followed by the number of substitutions made;
-- callers in this file use only the first value.
local function do_voicing(i_last, result, result_sp, hist, d, key)
	local text = result[i_last]
	if not hist and result_sp[i_last] == "historical w" then
		text = "w" .. text
	end
	-- i_last may be 0, where export.tr stores an empty string; there is no
	-- initial to voice in that case.
	if text == "" then
		return text, 0
	end
	-- Escape punctuation so the initial is matched literally rather than being
	-- interpreted as pattern syntax (e.g. a piece such as "(" or ".").
	local initial = get_initial(text):gsub("%p", "%%%0")
	return text:gsub("^" .. initial, d(key))
end

-- Transliterates kana text into Latin script.
-- text: the string to transliterate.
-- lang: language code; passed to get_data to select language-specific data.
-- sc: script code; not used in this function.
-- options: optional table; the fields read here are hist, keep_dot, phonetic
--   and no_diacritics.
-- Returns the transliterated string.
function export.tr(text, lang, sc, options)
	-- Track (but do not reject) input containing characters in the kanji range.
	if umatch(text, "[" .. mw.loadData("Module:ja/data/range").kanji .. "]") then
		require("Module:debug").track("ja/invalid Hrkt")
	end

	options = options or {}
	
	-- result holds the output pieces; index 0 holds an empty string and is the
	-- index getlast falls back to. result_sp holds a marker for the piece at
	-- the same index.
	local result = {[0] = ""}
	local result_sp = {}
	
	-- Lookup function over the transliteration data.
	local d = get_data(lang)
	
	-- Scans result backwards from i_start (default: the last piece) and returns
	-- the index of the first piece accepted by predicate_good, or 0 if none is
	-- found. Pieces from a ">" piece back to the preceding "<" piece are
	-- skipped. The scan stops (returning 0) when predicate_bad accepts a piece.
	-- Defaults: predicate_bad accepts pieces marked "stop"; predicate_good
	-- accepts non-empty pieces not marked "'".
	local function getlast(i_start, predicate_good, predicate_bad)
		local in_xml = false
		for i = i_start or #result, 1, -1 do
			if in_xml then
				if result[i] == "<" then in_xml = false end
			elseif result[i] == ">" then
				in_xml = true
			else
				if (predicate_bad or function(index)
					return result_sp[index] == "stop"
				end)(i) then break end
				if (predicate_good or function(index)
					return result[index]:len() > 0 and result_sp[index] ~= "'"
				end)(i) then return i end
			end
		end
		return 0
	end
	
	-- normalize long vowels and iteration marks
	text = toNFD(kata_to_hira(normalize_kana(text)))
	
	-- Process the text one UTF-8 character at a time.
	for c in text:gmatch(".[\128-\191]*") do
		-- rc: romanisation of this character (from rom_hist when options.hist is
		-- set and has an entry, else from rom, else the character itself).
		-- rc_sp: marker for this character from rom_sp, if any.
		local rc = options.hist and d("rom_hist", c) or d("rom", c) or c
		local rc_sp = d("rom_sp", c)
		local i_last = getlast()
		
		if options.keep_dot and c == "." then
			rc = "."
		elseif c:match("%a") then
			-- ASCII letters in the input are marked "stop".
			rc_sp = "stop"
		end
		
		-- If the data defines a digraph for this character after the previous
		-- piece, it replaces that piece and nothing is emitted for this character.
		local repl_digraph = d("digraph", c, result[i_last])
		if repl_digraph then
			result[i_last], rc = repl_digraph, ""
			result_sp[i_last], rc_sp = nil, nil
		end
		
		if not options.hist then -- handling of pieces flagged by flag_hahe (は/へ)
			-- The previous piece is flagged: swap in its marker text as the
			-- piece when the current character continues it.
			if d("flag_hahe", result_sp[i_last]) and (umatch(c, "[-%.゙゚]") or rc:match("%a") or rc == glottal) then
				result[i_last] = result_sp[i_last]
				result_sp[i_last] = nil
			end
			-- The current character is flagged: use its marker text as rc when
			-- options.phonetic is set or the preceding context qualifies.
			if d("flag_hahe", rc_sp) and (options.phonetic or result_sp[getlast(nil, function(i)
				return result[i]:len() > 0 and result_sp[i] ~= "'" or result_sp[i] == "stop"
			end, function() return false end)] == "stop" or result[i_last]:match"[-%a]" or result[i_last] == glottal) then
				rc = rc_sp
				rc_sp = nil
			end
		end
		
		if rc:match("%a") and umatch(result[i_last], "^[,%.?!:)Ӡ]$") then --space and punctuations
			result[i_last] = result[i_last] .. " "
		elseif umatch(rc, "^[(“]$") and result[i_last]:match("%a") then
			rc = " " .. rc
		end
		
		if rc_sp == "voiced" then -- voicing
			result[i_last] = do_voicing(i_last, result, result_sp, options.hist, d, "tr_voicing")
		elseif rc_sp == "semivoiced" then
			result[i_last] = do_voicing(i_last, result, result_sp, options.hist, d, "tr_semivoicing")
		end
		
		-- A piece ending in consonant letters (and not marked "stop") is marked
		-- "coda".
		if rc:match("[" .. cons .. "]+" .. "$") and rc_sp ~= "stop" then
			rc_sp = "coda"
		end
		
		local r_last = result[i_last]
		local r_lastlast = r_last:match"^.*(%a%A*)$" --vowel clusters or stop consonants
		if r_lastlast and r_lastlast:match("[aiueo]") then
			if rc:match("^%-[yw]") and r_last:match("^[" .. cons .. "yw]") then
				-- rc starts with "-y"/"-w": move the glide onto the previous piece
				-- (dropping that piece's last byte when it is longer than one byte).
				local rc_first = rc:sub(2, 2)
				r_last = #r_last > 1 and r_last:sub(1, -2) or r_last
				if not (rc_first == "y" and d("flag_postalveolarconsonant", r_last)) then
					r_last = r_last .. rc_first
				end
				result[i_last] = r_last
				rc = rc:sub(3)
			elseif options.hist and r_last:match("^[" .. cons .. "]") and (
				r_lastlast == "i" and rc:sub(1, 1) == "y" or
				r_lastlast == "u" and rc:sub(1, 1) == "w"
			) then
				-- Historical mode: previous piece ends in i/u and rc starts with
				-- the matching glide y/w; merge the glide into the previous piece.
				local rc_first = rc:sub(1, 1)
				r_last = r_last:sub(1, -2)
				if not (rc_first == "y" and d("flag_postalveolarconsonant", r_last)) then
					r_last = r_last .. rc_first
				end
				result[i_last] = r_last
				rc = rc:sub(2)
			elseif rc:match"^%-[yw]?[aiueo]$" then
				-- rc is "-" plus an optional glide plus a vowel: drop the "-" and
				-- adjust the end of the previous piece.
				rc = rc:sub(2)
				if r_lastlast == rc then
					result[i_last] = r_last .. r_lastlast
					rc = ""
				elseif d("flag_specialconsonant", r_last) then
					result[i_last] = r_last:sub(1, -2)
				elseif r_lastlast == "i" then
					result[i_last] = r_last:sub(1, -2) .. "y"
				elseif r_lastlast:match("[ou]") and rc ~= "u" then
					result[i_last] = r_last:sub(1, -2) .. "w"
				elseif #r_last > 1 then
					result[i_last] = r_last:sub(1, -2)
				end
			elseif rc:match("^[aiueo]$") then
				-- A bare vowel is appended to the previous piece when tr_long has
				-- an entry for the resulting vowel pair and that piece does not
				-- already end in two vowels.
				if not options.hist and not options.phonetic and d("tr_long", r_lastlast .. rc) and not result[i_last]:match("[aiueo][aiueo]$") then
					result[i_last] = result[i_last] .. rc
					rc = ""
				end
			end
		end
		
		-- Append the piece and its marker at the same index.
		insert(result, rc)
		result_sp[#result] = rc_sp
	end
	
	if not options.hist then -- isolated flag_hahe piece (は/へ)
		local i_last = getlast()
		if d("flag_hahe", result_sp[i_last]) and getlast(i_last - 1) == 0 then
			result[i_last] = result_sp[i_last]
		end
	end
	
	-- Second pass over the pieces: gemination, disambiguating apostrophes and
	-- long-vowel diacritics.
	local has_gem = false
	for i, v in ipairs(result) do
		--gemination
		if has_gem then
			local apos, consonant, remainder = v:match("^(" .. glottal .. "*)([" .. cons .. "yw]+)(.*)")
			if consonant then
				local init, c_gem = apos .. consonant
				-- Look up the geminate for init, shortening init while it ends in
				-- y/w and is longer than one byte.
				-- NOTE(review): each iteration overwrites c_gem, so a hit for a
				-- longer init is replaced by the lookup for the shorter one --
				-- confirm this is intended.
				while true do
					c_gem = d("tr_gem", init)
					if #init == 1 or not init:match("[yw]$") then
						break
					end
					init = init:sub(1, -2)
				end
				c_gem = c_gem or init:sub(1, 1)
				v = consonant .. remainder
				-- Walk backwards replacing every "gem" piece with c_gem, passing
				-- over "allow gem" pieces; on reaching any other piece, prefix
				-- apos to the piece getlast finds from the following index.
				local i_gem = getlast(i)
				while true do
					i_gem = getlast(i_gem - 1)
					if result_sp[i_gem] == "gem" then
						result[i_gem] = c_gem
					elseif result_sp[i_gem] ~= "allow gem" then
						i_gem = getlast(i_gem + 1)
						result[i_gem] = apos .. result[i_gem]
						break
					end
				end
				has_gem = false
			end
		elseif result_sp[i] == "gem" then
			has_gem = true
		end
		
		-- FIXME: ng/nw should be determined automatically by a disambiguation model.
		local v_first = v:match("^[aiueoyw]") or v:match("^n[gw]")
		if v_first then
			-- Find the preceding piece; the predicate_bad passed here never
			-- accepts, so "stop" pieces do not end the scan.
			local i_last
			if v_first == "y" or v_first == "w" or v_first == "ng" or v_first == "nw" then
				i_last = getlast(i - 1, function(index)
					local res, res_sp = result[index], result_sp[index]
					return res ~= "" and res ~= "." and res_sp ~= "'" and res_sp ~= "gem"
				end, function() end)
			else
				i_last = getlast(i - 1, nil, function() end)
			end
			-- Prefix the disambiguation placeholder where required.
			if v_first:sub(1, 1) == "n" then
				if umatch(result[i_last], "%a") and not (v_first == "nw" and result[i_last]:match("n$")) then
					v = disambig .. v
				end
			elseif result_sp[i_last] == "coda" then
				local coda = d("tr_coda_apos", v_first, result[i_last])
				if coda == nil or options.hist and coda == "hist" then
					v = disambig .. v
				end
			end
		end
		
		--diacritics (long vowels and others)
		if not options.no_diacritics then
			-- Each vowel-initial run is replaced by its tr_long entry, if any.
			v = v:gsub("[aiueo][aiueo%A]*", d("tr_long"))
		end
		
		result[i] = v
	end
	
	local num_cap = 0
	for i, v in ipairs(result) do
		--uppercase
		-- Each "cap" marker requests one more upper-cased character; the
		-- counter drops only when upper-casing actually changed a character.
		if result_sp[i] == "cap" then
			num_cap = num_cap + 1
		end
		if num_cap > 0 then
			result[i] = v:gsub(".[\128-\191]*", function(c)
				if num_cap <= 0 then return c end
				local uc = c:uupper()
				if c ~= uc then num_cap = num_cap - 1 end
				return uc
			end)
		end
	end
	
	-- Join the pieces (from index 1) and turn both placeholder characters into
	-- apostrophes; the parentheses drop gsub's second return value.
	return (concat(result):gsub("[" .. glottal .. disambig .. "]", "'"))
end

return export