Modulo:undecorate
Salti al navigilo
Salti al serĉilo
![]() | ||
Memtesto ne disponeblas. |
- prilaboras signoĉenon konvertante certajn latinajn signojn kun ĉapeloj al krudaj latinaj literoj "A"..."Z" kaj "a"..."z"
- uzata en {{sendekoraciigo}}, {{la-ligilo}}
--[===[
MODULE "UNDECORATE" (undecorate)
"eo.wiktionary.org/wiki/Modulo:undecorate" <!--2023-Jan-19-->
Purpose: processes a string, converting certain decorated Latin
letters to raw Latin ASCII letters "A"..."Z" and "a"..."z"
Utilo: prilaboras signocxenon konvertante certajn dekoracihavajn latinajn
literojn al krudaj latinaj askiaj literoj "A"..."Z" kaj "a"..."z"
Manfaat: mengonversi sebuah string ...
Syfte: bearbetar en straeng genom att konvertera vissa dekorerade latinska
bokstaever till raaa latinska ASCII bokstaever "A"..."Z" och "a"..."z"
Used by templates / Uzata far sxablonoj:
- "SXablono:sendekoraciigo" <- "SXablono:deveno3" "SXablono:t"
"SXablono:alilivivo"
- "SXablono:la-ligilo"
Required submodules / Bezonataj submoduloj / Submodul yang diperlukan: none
This module can accept parameters whether sent to itself (own frame) or
to the caller (caller's frame). If there is a parameter "caller=true"
on the own frame then that own frame is discarded in favor of the
caller's one.
Parameters: * 1 anonymous and obligatory parameter
* UTF8 text (empty legal)
* 1 anonymous and optional parameter
* "1" to include the optional chars (by default
they are kept unchanged)
Returned: * ASCII text (empty can occur)
This module is unbreakable (when called with correct module name
and function name).
Cxi tiu modulo estas nerompebla (kiam vokita kun gxustaj nomo de modulo
kaj nomo de funkcio).
Special diagnostic strings in output:
* "XXX" -- truncated or invalid UTF8 stream in input
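* "YYY" -- 4-oct char handed to the converter "lfdokonv" (the main loop
currently rejects 4-oct input with "XXX" before that can happen)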
* "ZZZ" -- internal error, broken "contabua"
Format of data inside "contabua":
* sequence of line subblocks
* UINT8 : target ASCII code
* n * UINT8 : sequence of source UTF8 char:s, b7 ZERO in the last
octet of a char means that the char is optional (ZERO shifts the
range from $80...$BF down to $00...$3F, conflict is
impossible since we have the length in advance seized from
the beginning octet), the sequence of UTF8 char:s ends when
a value below $80 is encountered in beginning-octet position
(either the target ASCII letter of the next line, or ASCII "5"
as the final termination value)
* ASCII "5" ($35) as termination value
]===]
local exporttable = {} -- table of exported functions, returned at the end of the module
------------------------------------------------------------------------
---- CONSTANTS [O] ----
------------------------------------------------------------------------
-- Lookup data for "lfdokonv". The table is ZERO-based (note the explicit
-- "[0]=" on the first element, the following elements get 1, 2, ...).
-- Every row holds one target ASCII code followed by the UTF8 octets of
-- the source chars that map to it. A final octet below 128 (b7 cleared)
-- marks the char as optional. The lone 53 (ASCII "5") ends the table.
-- Row comments: target letter, then the source code points in hex
-- (U+...), parenthesized code points are the optional ones.
local contabua = {
[0]=65,195,129,195,128,195,130,195,131,195,5,195,4,196,128,196,130, -- A: C1 C0 C2 C3 (C5) (C4) 100 102
97,195,161,195,160,195,162,195,163,195,37,195,36,196,129,196,131, -- a: E1 E0 E2 E3 (E5) (E4) 101 103
67,195,135,196,134,196,8, -- C: C7 106 (108)
99,195,167,196,135,196,9, -- c: E7 107 (109)
69,196,146,196,148,195,139,195,137,195,138, -- E: 112 114 CB C9 CA
101,196,147,196,149,195,171,195,169,195,170, -- e: 113 115 EB E9 EA
71,196,28, -- G: (11C)
103,196,29, -- g: (11D)
72,196,36, -- H: (124)
104,196,37, -- h: (125)
73,196,170,196,172,195,143,195,141, -- I: 12A 12C CF CD
105,196,171,196,173,195,175,195,173, -- i: 12B 12D EF ED
74,196,52, -- J: (134)
106,196,53, -- j: (135) -- this 53 is a char octet, NOT the terminator
78,195,145, -- N: D1
110,195,177, -- n: F1
79,197,140,197,142,195,147,195,148,195,149,195,22,197,144, -- O: 14C 14E D3 D4 D5 (D6) 150
111,197,141,197,143,195,179,195,180,195,181,195,54,197,145, -- o: 14D 14F F3 F4 F5 (F6) 151
83,197,28, -- S: (15C)
115,197,29, -- s: (15D)
85,197,170,197,172,195,156,195,154,195,153,197,174,197,176, -- U: 16A 16C DC DA D9 16E 170
117,197,171,197,173,195,188,195,186,195,185,197,175,197,177, -- u: 16B 16D FC FA F9 16F 171
89,197,184,200,178, -- Y: 178 232
121,195,191,200,179, -- y: FF 233
90,197,189, -- Z: 17D
122,197,190, -- z: 17E
53 -- terminator, ASCII "5"
}
------------------------------------------------------------------------
---- MATH FUNCTIONS [E] ----
------------------------------------------------------------------------
-- Local function MATHDIV
-- Integer division, the quotient is rounded down (towards minus infinity).
-- Input  : * xdividend, xdivisor -- numbers
-- Output : * floor of "xdividend / xdivisor"
local function mathdiv (xdividend, xdivisor)
  return math.floor (xdividend / xdivisor)
end--function mathdiv
-- Local function MATHMOD
-- Remainder of a division, a thin wrapper around the "%" operator.
-- Input  : * xdividendo, xdivisoro -- numbers
-- Output : * "xdividendo % xdivisoro"
local function mathmod (xdividendo, xdivisoro)
  return (xdividendo % xdivisoro)
end--function mathmod
------------------------------------------------------------------------
-- Local function MATHBITTEST
-- Tell whether the single bit selected by a ZERO-based index is set.
-- Input  : * numincoming -- number to inspect
--          * numbitindex -- ZERO-based index of the bit (0 is the lowest bit)
-- Output : * boolean, "true" if the selected bit is ONE
-- Depends on functions :
-- [E] mathdiv mathmod
local function mathbittest (numincoming, numbitindex)
  -- shift right one step per round until the wanted bit sits at position
  -- ZERO, or until no bits are left (then the result is "false" anyway)
  while ((numbitindex~=0) and (numincoming~=0)) do
    numincoming = mathdiv (numincoming,2)
    numbitindex = numbitindex - 1
  end--while
  return (mathmod (numincoming,2)==1)
end--function mathbittest
------------------------------------------------------------------------
---- UTF8 FUNCTIONS [U] ----
------------------------------------------------------------------------
-- Local function LFULNUTF8CHAR
-- Evaluate the length in octet:s of a single UTF8 char from its
-- beginning octet.
-- Input  : * numbgoctet -- beginning octet of a UTF8 char
-- Output : * number 1...4, or ZERO if the octet cannot begin a char
-- Looks at this 1 octet only, the following octet:s are NOT checked.
local function lfulnutf8char (numbgoctet)
  if (numbgoctet<128) then
    return 1 -- $00...$7F -- ANSI/ASCII
  end--if
  if ((numbgoctet>=194) and (numbgoctet<=223)) then
    return 2 -- $C2 to $DF
  end--if
  if ((numbgoctet>=224) and (numbgoctet<=239)) then
    return 3 -- $E0 to $EF
  end--if
  if ((numbgoctet>=240) and (numbgoctet<=244)) then
    return 4 -- $F0 to $F4
  end--if
  return 0 -- $80...$C1 or $F5...$FF -- invalid as beginning octet
end--function lfulnutf8char
------------------------------------------------------------------------
---- HIGH LEVEL FUNCTIONS [H] ----
------------------------------------------------------------------------
-- Local function LFHSPLITB7NOW
-- Split a UINT8 into the octet with b7 forced to ONE and the original
-- state of b7 as a boolean.
-- Input  : * nummain -- octet value
-- Output : * the octet with b7 set (128 is added if b7 was clear)
--          * boolean, "true" if b7 was set in the incoming value
-- Depends on functions :
-- [E] mathbittest
local function lfhsplitb7now (nummain)
  if (mathbittest (nummain,7)) then
    return nummain,true -- b7 already ONE, nothing to restore
  end--if
  return (nummain + 128),false -- restore b7, report that it was ZERO
end--function lfhsplitb7now
------------------------------------------------------------------------
-- Local function LFDOKONV
-- Try to convert ie undecorate one UTF8 char by a linear search through
-- the table "contabua".
-- Input  : * numinutflen -- length of the char in octet:s, 2 or 3
--          * numvlsrc0, numvlsrc1, numvlsrc2 -- the octet:s of the char
--                  ("numvlsrc2" is looked at only if the length is 3)
--          * booinkall -- "true" to convert the optional chars too
-- Output : * string, 4 possibilities
--            * "YYY" if "numinutflen" is neither 2 nor 3
--            * "ZZZ" if "contabua" turns out to be broken
--            * the char unchanged (no hit, or hit on an optional char
--              while "booinkall" does not request inclusion)
--            * the replacing ASCII letter (1 octet)
-- Depends on functions :
-- [H] lfhsplitb7now
-- [U] lfulnutf8char
-- [E] mathbittest mathdiv mathmod
-- Depends on constants :
-- * table "contabua" -- length is unknown but the end is marked with value 53
-- Note the inverted meaning of "b7" after it is picked by "lfhsplitb7now":
-- * ONE ie "true" -- char is NOT optional
-- * ZERO ie "false" -- char is optional and excluded from conversion
--                      unless "booinkall" requests inclusion
local function lfdokonv (numinutflen,numvlsrc0,numvlsrc1,numvlsrc2,booinkall)
  local numcursor = 0 -- ZERO-based read position in "contabua"
  local numtarget = 0 -- ASCII code heading the current row
  local numlead = 0 -- beginning octet of the current table char
  local numsecond = 0
  local numthird = 0
  local numseqlen = 0 -- length of the current table char, 2 or 3
  local boomandatory = false -- "false" if the table char is optional

  -- rebuild the incoming char from its octet:s, evaluated only when the
  -- char really has to be returned unchanged
  local function keepasis ()
    if (numinutflen==3) then
      return string.char (numvlsrc0,numvlsrc1,numvlsrc2)
    end--if
    return string.char (numvlsrc0,numvlsrc1)
  end--function keepasis

  if ((numinutflen~=2) and (numinutflen~=3)) then
    return "YYY" -- unsupported length
  end--if

  while (true) do -- over the rows ie over the target ASCII chars
    numtarget = contabua[numcursor]
    if ((numtarget<65) or (numtarget>122)) then
      return "ZZZ" -- row does not begin with ASCII, broken static data
    end--if
    numcursor = numcursor + 1
    while (true) do -- over the source UTF8 chars of one row
      numlead = contabua[numcursor]
      if (numlead<128) then -- next row or terminator
        if (numlead==53) then
          return keepasis () -- whole table searched without any hit
        end--if
        break -- do NOT advance, the outer loop has to re-read this value
      end--if
      numcursor = numcursor + 1
      numseqlen = lfulnutf8char (numlead)
      if ((numseqlen~=2) and (numseqlen~=3)) then
        return "ZZZ" -- broken static data
      end--if
      numsecond,boomandatory = lfhsplitb7now (contabua[numcursor])
      numcursor = numcursor + 1
      if (numseqlen==3) then -- the marker of the last octet wins
        numthird,boomandatory = lfhsplitb7now (contabua[numcursor])
        numcursor = numcursor + 1
      end--if
      if ((numseqlen==numinutflen) and (numlead==numvlsrc0) and
          (numsecond==numvlsrc1) and
          ((numseqlen~=3) or (numthird==numvlsrc2))) then
        if (boomandatory or booinkall) then
          return string.char (numtarget) -- take the hit
        end--if
        return keepasis () -- optional char and not requested, discard the hit
      end--if
    end--while -- over the source UTF8 chars of one row
  end--while -- over the rows
end--function lfdokonv
------------------------------------------------------------------------
---- VARIABLES [R] ----
------------------------------------------------------------------------
-- Exported function EK
-- Entry point of the module, undecorates the text given as parameter.
-- Input  : * arxframent -- "frame" object, the parameters are read from its
--                          "args", or from the "args" of the parent frame
--                          if our own "args" contain "caller=true"
--            * args[1] -- text to process, anything but a string counts
--                         as empty text
--            * args[2] -- "1" to convert the optional chars too
-- Output : * string, can be empty
-- Input is walked char by char, ASCII is copied, chars of 2 or 3 octet:s are
-- handed to "lfdokonv". An invalid beginning octet, a char of 4 octet:s, or
-- a char truncated by the end of the text adds "XXX" and stops the processing.
-- Never raises on a missing or incomplete "frame", there is simply nothing
-- to process then.
-- Depends on functions :
-- [H] lfdokonv
-- [U] lfulnutf8char
function exporttable.ek (arxframent)
  -- special type "args" AKA "arx"
  local arxsomons = nil -- "args" from our own or caller's "frame"
  local arxparent = nil -- caller's "frame" if requested and available
  -- general "str"
  local strintxt = "" -- input string from [1]
  -- general "tab"
  local tabpieces = {} -- output pieces, joined once at the end
  -- general "num"
  local numpieces = 0
  local numsrclen = 0
  local numsrcind = 0 -- ZERO-based, number of octet:s consumed so far
  local numvalsrc0 = 0
  local numvalsrc1 = 0
  local numvalsrc2 = 0
  local numvalsrle = 0
  -- general "boo"
  local booalsoeks = false -- from [2]

  ---- GET THE ARX (ONE OF TWO) ----
  if (type(arxframent)=="table") then
    arxsomons = arxframent.args -- "args" from our own "frame"
  end--if
  if (type(arxsomons)~="table") then
    arxsomons = {} -- guard against indexing error from our own
  end--if
  if (arxsomons['caller']=="true") then
    -- there can be no parent "frame" at all, and a plain table passed in
    -- by other code can lack the method, check before indexing
    if (type(arxframent.getParent)=="function") then
      arxparent = arxframent:getParent()
    end--if
    if (type(arxparent)=="table") then
      arxsomons = arxparent.args -- "args" from caller's "frame"
    else
      arxsomons = nil
    end--if
    if (type(arxsomons)~="table") then
      arxsomons = {} -- guard against indexing error again
    end--if
  end--if

  ---- GET THE PARAMETERS ----
  strintxt = arxsomons[1]
  if (type(strintxt)~="string") then
    strintxt = ""
  end--if
  booalsoeks = (arxsomons[2]=="1") -- "1" AKA "true" to take excluded too

  ---- CARRY OUT THE HARD WORK ----
  local function xxpick () -- upvalues "strintxt" (const) and "numsrcind" (upd)
    numsrcind = numsrcind + 1
    return string.byte (strintxt,numsrcind,numsrcind)
  end--function xxpick

  local function xxemit (strpiece) -- collect, avoid repeated string copying
    numpieces = numpieces + 1
    tabpieces[numpieces] = strpiece
  end--function xxemit

  numsrclen = string.len (strintxt)
  while (numsrcind<numsrclen) do
    numvalsrc0 = xxpick ()
    numvalsrle = lfulnutf8char (numvalsrc0)
    if (numvalsrle==1) then
      xxemit (string.char (numvalsrc0)) -- ASCII, do not even attempt to convert
    elseif (((numvalsrle~=2) and (numvalsrle~=3)) or ((numsrcind+numvalsrle-1)>numsrclen)) then
      xxemit ("XXX") -- invalid, 4-oct (NOT supported yet) or truncated
      break
    else
      numvalsrc1 = xxpick ()
      numvalsrc2 = 0 -- ignored by "lfdokonv" unless the length is 3
      if (numvalsrle==3) then
        numvalsrc2 = xxpick ()
      end--if
      xxemit (lfdokonv (numvalsrle,numvalsrc0,numvalsrc1,numvalsrc2,booalsoeks))
    end--if
  end--while

  ---- RETURN THE RESULT STRING ----
  return table.concat (tabpieces) -- can be empty
end--function
---- RETURN THE TABLE OF EXPORTED FUNCTIONS ----
return exporttable