Modulo:undecorate

El Vikivortaro
 MODULO
Memtesto ne disponeblas.
  • prilaboras signoĉenon konvertante certajn latinajn signojn kun ĉapeloj al krudaj latinaj literoj "A"..."Z" kaj "a"..."z"
  • uzata en {{sendekoraciigo}}, {{la-ligilo}}

--[===[

MODULE "UNDECORATE" (undecorate)

"eo.wiktionary.org/wiki/Modulo:undecorate" <!--2023-Apr-19-->

Purpose: processes a string, converting certain decorated Latin
         letters to raw Latin ASCII letters "A"..."Z" and "a"..."z"

Utilo: prilaboras signocxenon konvertante certajn dekoracihavajn latinajn
       literojn al krudaj latinaj askiaj literoj "A"..."Z" kaj "a"..."z"

Manfaat: mengonversi sebuah string ...

Syfte: bearbetar en straeng genom att konvertera vissa dekorerade latinska
       bokstaever till raaa latinska ASCII bokstaever "A"..."Z" och "a"..."z"

Used by templates / Uzata far sxablonoj:
- "SXablono:sendekoraciigo" <- "SXablono:deveno3" "SXablono:t"
                               "SXablono:alilivivo"
- "SXablono:la-ligilo"

Required submodules / Bezonataj submoduloj / Submodul yang diperlukan: none

This module can accept parameters whether sent to itself (own frame) or
to the caller (caller's frame). If there is a parameter "caller=true"
on the own frame then that own frame is discarded in favor of the
caller's one.

Parameters: * 1 anonymous and obligatory parameter
              * UTF8 text (empty legal)
            * 1 anonymous and optional parameter
              * "1" to attack the additional chars (by default
                they are kept unchanged)

Returned: * ASCII text (empty can occur)

This module is unbreakable (when called with correct module name
and function name).

Cxi tiu modulo estas nerompebla (kiam vokita kun gxustaj nomo de modulo
kaj nomo de funkcio).

Special diagnostic strings in output:
* "XXX" -- truncated or invalid UTF8 stream in input
* "YYY" -- 4-oct in input
* "ZZZ" -- internal error, broken "contabudkonv"

Format of data inside "contabudkonv":
* sequence of line subblocks
  * UINT8 : target ASCII code
  * n * UINT8 : sequence of source UTF8 char:s; b7 ZERO in the last
                octet of a char means that the char is additional (ZERO
                shifts the range from $80...$BF down to $00...$3F, conflict
                is impossible since we have the length in advance seized
                from the beginning octet); the sequence of UTF8 char:s ends
                when an ASCII value is encountered (either the target letter
                of the next line, or ASCII "5" as the final termination value)
* ASCII "5" ($35) as termination value

]===]

-- Table returned at the end of the module; public functions are attached to it.
local exporttable = {}

------------------------------------------------------------------------

---- CONSTANTS [O] ----

------------------------------------------------------------------------

  -- Conversion table scanned linearly by "lfudokonv".
  -- Indexing is ZERO-based: "[0]=" places the first value at index 0 and the
  -- positional values that follow occupy indexes 1, 2, 3, ...
  -- Each row below is one subblock:
  --   * first value : target ASCII code (named in the trailing comment)
  --   * then        : octets of the source UTF8 chars mapped to that target;
  --                   a value below 128 in the last octet of a char has had
  --                   128 subtracted and flags the char as "additional"
  --                   (converted only on explicit request)
  -- The lone value 53 (ASCII "5") terminates the table.
  local contabudkonv = {
    [0]=65,195,129,195,128,195,130,195,131,195,5,195,4,196,128,196,130, -- target "A"
    97,195,161,195,160,195,162,195,163,195,37,195,36,196,129,196,131, -- target "a"
    67,195,135,196,134,196,8, -- target "C"
    99,195,167,196,135,196,9, -- target "c"
    69,196,146,196,148,195,139,195,137,195,138, -- target "E"
    101,196,147,196,149,195,171,195,169,195,170, -- target "e"
    71,196,28, -- target "G"
    103,196,29, -- target "g"
    72,196,36, -- target "H"
    104,196,37, -- target "h"
    73,196,170,196,172,195,143,195,141, -- target "I"
    105,196,171,196,173,195,175,195,173, -- target "i"
    74,196,52, -- target "J"
    106,196,53, -- target "j"
    78,195,145, -- target "N"
    110,195,177, -- target "n"
    79,197,140,197,142,195,147,195,148,195,149,195,22,197,144, -- target "O"
    111,197,141,197,143,195,179,195,180,195,181,195,54,197,145, -- target "o"
    83,197,28, -- target "S"
    115,197,29, -- target "s"
    85,197,170,197,172,195,156,195,154,195,153,197,174,197,176, -- target "U"
    117,197,171,197,173,195,188,195,186,195,185,197,175,197,177, -- target "u"
    89,197,184,200,178, -- target "Y"
    121,195,191,200,179, -- target "y"
    90,197,189, -- target "Z"
    122,197,190, -- target "z"
    53 -- terminator, ASCII "5"
    }

------------------------------------------------------------------------

---- MATH FUNCTIONS [E] ----

------------------------------------------------------------------------

-- Local function MATHDIV

local function mathdiv (xdividens, xdivisero)
  -- Floored quotient of the two arguments (stand-in for a DIV operator).
  return math.floor (xdividens / xdivisero)
end--function mathdiv

-- Local function MATHMOD

local function mathmod (xdividendo, xdivisoro)
  -- Remainder of the division, delegated to the built-in "%" operator.
  return (xdividendo % xdivisoro)
end--function mathmod

------------------------------------------------------------------------

-- Local function MATHBITTEST

-- Find out whether single bit selected by ZERO-based index is "1" / "true".

-- Result has type "boolean".

-- Depends on functions :
-- [E] mathdiv mathmod

local function mathbittest (numincoming, numbitindex)
  -- Report whether the bit at ZERO-based position "numbitindex" of
  -- "numincoming" is set. The value is halved once per index step; the
  -- loop ends as soon as the wanted bit has arrived at position ZERO or
  -- the value has run out of bits (then the answer is "false").
  while ((numbitindex~=0) and (numincoming~=0)) do
    numincoming = mathdiv(numincoming,2) -- shift right by one bit
    numbitindex = numbitindex - 1 -- one step closer to the wanted bit
  end--while
  return (mathmod(numincoming,2)==1) -- lowest bit is the answer
end--function mathbittest

------------------------------------------------------------------------

---- UTF8 FUNCTIONS [U] ----

------------------------------------------------------------------------

-- Local function LFULNUTF8CHAR

-- Evaluate length of a single UTF8 char in octet:s.

-- Input  : * numbgoctet  -- beginning octet of a UTF8 char

-- Output : * numlen1234x -- number 1...4 or ZERO if invalid

-- Does NOT thoroughly check the validity, looks at 1 octet only.

local function lfulnutf8char (numbgoctet)
  -- Classify the beginning octet of a UTF8 char and return the char's
  -- length in octets (1...4), or ZERO if the octet cannot begin a char.
  -- Only this single octet is inspected.
  if (numbgoctet<128) then
    return 1 -- $00...$7F -- ANSI/ASCII
  end--if
  if (numbgoctet<194) then
    return 0 -- $80...$C1 -- not a legal beginning octet
  end--if
  if (numbgoctet<=223) then
    return 2 -- $C2 to $DF
  end--if
  if (numbgoctet<=239) then
    return 3 -- $E0 to $EF
  end--if
  if (numbgoctet<=244) then
    return 4 -- $F0 to $F4
  end--if
  return 0 -- $F5 and above -- not a legal beginning octet
end--function lfulnutf8char

------------------------------------------------------------------------

-- Local function LFUSPLITB7NOW

-- Split a UINT8 into b0...b6 (b7 is always ONE) and separate b7 as boolean.

-- Called only from lfudokonv.

local function lfusplitb7now (nummain)
  -- Return two values: the octet with b7 forced to ONE, and a boolean
  -- telling whether b7 was already ONE in the incoming value.
  local boob7bit = mathbittest (nummain,7)
  if (boob7bit) then
    return nummain,boob7bit -- b7 already set, value passes unchanged
  end--if
  return (nummain + 128),boob7bit -- b7 was clear, set it
end--function lfusplitb7now

------------------------------------------------------------------------

-- Local function LFUDOKONV

-- Try to convert ie undecorate one UTF8 char.

-- Input  : * numinutf5len -- 2 or 3
--          * booalso5addi -- "true" to convert the additional ones too

-- Output : * strdukonv -- 4 possibilities

-- Depends on functions :
-- [U] lfulnutf8char lfusplitb7now
-- [E] mathbittest mathdiv mathmod

-- Depends on constants :
-- * table "contabudkonv" -- length is unknown but the end is
--                           marked with value 53

-- Called only from lfuremovedeko.

-- Note that "numinutf5len" can be 2 or 3 only, do NOT call this otherwise.

-- Note the inverted meaning of "b7" after split in "lfusplitb7now":
-- * ONE default  -- char is base ie NOT additional
-- * ZERO special -- char is additional and excluded from conversion unless
--                   "booalso5addi" requests attack

local function lfudokonv (numinutf5len,numvlsrc0,numvlsrc1,numvlsrc2,booalso5addi)

  -- Look up one 2-octet or 3-octet UTF8 char in "contabudkonv" and return
  -- the string that replaces it.

  -- Input  : * numinutf5len -- length of the char in octets, 2 or 3
  --          * numvlsrc0, numvlsrc1, numvlsrc2 -- octets of the char
  --            ("numvlsrc2" is ignored when the length is 2)
  --          * booalso5addi -- "true" to convert "additional" chars too

  -- Output : * one ASCII letter if the char is listed and eligible
  --          * the char unchanged if it is not listed, or is listed as
  --            "additional" while "booalso5addi" is "false"
  --          * "YYY" if "numinutf5len" is neither 2 nor 3
  --          * "ZZZ" if "contabudkonv" is malformed; this includes missing
  --            or non-numeric entries, which would otherwise raise an
  --            error on comparison against "nil"

  local strdukonv = ''
  local numresult = 0 -- 0 searching 1 "YYY" 2 "ZZZ" 3 unchanged 4 replacement
  local numdestascii = 0 -- target ASCII code of the current subblock
  local numsrindex = 0 -- read position in "contabudkonv" (ZERO-based)
  local numpeeker0 = 0
  local numpeeker1 = 0
  local numpeeker2 = 0
  local numpanjang = 0 -- length of the table char, as opposed to "numinutf5len"
  local boonexk = false -- "false" if the table char is additional

  if ((numinutf5len~=2) and (numinutf5len~=3)) then
    numresult = 1 -- "YYY"
  end--if

  while (numresult==0) do -- outer loop -- over target ASCII codes

    numdestascii = contabudkonv[numsrindex] -- must be ASCII
    if ((type(numdestascii)~='number') or (numdestascii<65) or (numdestascii>122)) then
      numresult = 2 -- "ZZZ" -- broken static data
      break
    end--if
    numsrindex = numsrindex + 1

    while (numresult==0) do -- inner loop -- over source chars of this target

      numpeeker0 = contabudkonv[numsrindex]
      if (type(numpeeker0)~='number') then
        numresult = 2 -- "ZZZ" -- table ended without the terminator
        break
      end--if
      if (numpeeker0<128) then -- next target ASCII code or the terminator
        if (numpeeker0==53) then
          numresult = 3 -- terminator reached without a match, keep unchanged
        end--if
        break -- index NOT advanced, the outer loop reads this value again
      end--if
      numsrindex = numsrindex + 1 -- advance only AFTER the check against ASCII

      numpanjang = lfulnutf8char (numpeeker0)
      if ((numpanjang~=2) and (numpanjang~=3)) then
        numresult = 2 -- "ZZZ"
        break
      end--if

      numpeeker1 = contabudkonv[numsrindex]
      if (type(numpeeker1)~='number') then
        numresult = 2 -- "ZZZ"
        break
      end--if
      numpeeker1,boonexk = lfusplitb7now (numpeeker1)
      numsrindex = numsrindex + 1

      if (numpanjang==3) then
        numpeeker2 = contabudkonv[numsrindex]
        if (type(numpeeker2)~='number') then
          numresult = 2 -- "ZZZ"
          break
        end--if
        numpeeker2,boonexk = lfusplitb7now (numpeeker2) -- flag from last octet wins
        numsrindex = numsrindex + 1
      end--if

      if ((numpanjang==numinutf5len) and (numpeeker0==numvlsrc0) and (numpeeker1==numvlsrc1)) then
        if ((numpanjang~=3) or (numpeeker2==numvlsrc2)) then
          if (boonexk or booalso5addi) then
            numresult = 4 -- replacement, take the hit
          else
            numresult = 3 -- additional char and not requested, discard the hit
          end--if
        end--if
      end--if

    end--while -- inner loop

  end--while -- outer loop

  if (numresult==1) then -- bad "numinutf5len"
    strdukonv = "YYY"
  elseif (numresult==2) then -- bad "contabudkonv"
    strdukonv = "ZZZ"
  elseif (numresult==3) then -- unchanged
    if (numinutf5len==3) then
      strdukonv = string.char(numvlsrc0,numvlsrc1,numvlsrc2)
    else
      strdukonv = string.char(numvlsrc0,numvlsrc1)
    end--if
  elseif (numresult==4) then -- replace
    strdukonv = string.char(numdestascii)
  end--if

  return strdukonv

end--function lfudokonv

------------------------------------------------------------------------

-- Local function LFUREMOVEDEKO

-- Remove decorations from Latin characters in a string. There is a base set
-- of chars always attacked, and an additional set attacked only if requested
-- by a boolean variable.

-- Input  : * strdedekrin -- empty is useless but cannot cause major harm
--          * booalso6addi -- "true" to attack the additional chars too
--                            (by default they are left unchanged)

-- Output : * strautput -- either same number of UTF8 char:s but usually less
--                         octet:s, or "XXX" "YYY" "ZZZ"

-- Depends on functions :
-- [U] lfulnutf8char lfusplitb7now lfudokonv
-- [E] mathbittest mathdiv mathmod

-- Depends on constants :
-- * table "contabudkonv" -- length is unknown but the end is
--                           marked with value 53

-- Special diagnostic strings in output:
-- * "XXX" -- truncated or invalid UTF8 stream in input
-- * "YYY" -- 4-oct in input
-- * "ZZZ" -- internal error, broken "contabudkonv"

-- Format of data inside "contabudkonv":
-- * sequence of line subblocks
--   * UINT8 : target ASCII code
--   * n * UINT8 : sequence of source UTF8 char:s, b7 ZERO in last subsequent
--                 octet means that the char is additional (ZERO shifts
--                 the range from $80...$BF down to $00...$3F, conflict is
--                 impossible since we have the length in advance seized from
--                 beginning octet), the sequence of UTF8 char:s ends when
--                 either a valid ASCII letter is encountered (coming from
--                 next line, or ASCII "5" as the final termination value)
-- * ASCII "5" ($35) as termination value

local function lfuremovedeko (strdedekrin, booalso6addi)

  -- Walk through a UTF8 string char by char and undecorate the Latin
  -- letters listed in "contabudkonv".

  -- Input  : * strdedekrin  -- UTF8 text (empty is legal, a non-string
  --                            is treated as empty)
  --          * booalso6addi -- "true" to convert the additional chars too

  -- Output : * the converted text, where
  --            * 1-octet chars are copied unchanged
  --            * 2-octet and 3-octet chars are handed to "lfudokonv"
  --            * a complete 4-octet char is replaced by "YYY" and the
  --              processing continues with the following char
  --            * an invalid beginning octet or a truncated char appends
  --              "XXX" and ends the processing

  local tabpieces = {} -- output fragments, joined once at the very end
  local numpieces = 0
  local numsrclen = 0
  local numsrcind = 0 -- ZERO-based index of the next octet to read
  local numvalsrle = 0
  local numvalsrc0 = 0 -- beginning octet
  local numvalsrc1 = 0 -- 2nd octet
  local numvalsrc2 = 0 -- 3rd octet, stale and unused for 2-octet chars

    local function xxlodsb () -- upvalues "strdedekrin" (const) and "numsrcind" (upd)
      numsrcind = numsrcind + 1
      return string.byte (strdedekrin,numsrcind,numsrcind)
    end--function xxlodsb

    local function xxemit (strpiece) -- upvalues "tabpieces" and "numpieces"
      numpieces = numpieces + 1
      tabpieces[numpieces] = strpiece
    end--function xxemit

  if (type(strdedekrin)~='string') then
    strdedekrin = '' -- guard against error from "string.len"
  end--if
  numsrclen = string.len (strdedekrin)

  while (numsrcind<numsrclen) do
    numvalsrc0 = xxlodsb ()
    numvalsrle = lfulnutf8char (numvalsrc0)
    if (numvalsrle==1) then
      xxemit (string.char (numvalsrc0)) -- ASCII, do not even attempt
    else
      -- "numsrcind" already points past the beginning octet, thus the
      -- remaining "numvalsrle-1" octets must fit before the end
      if ((numvalsrle==0) or ((numsrcind+numvalsrle-1)>numsrclen)) then
        xxemit ("XXX") -- truncated or invalid UTF8 stream in input
        break
      end--if
      if (numvalsrle==4) then
        numsrcind = numsrcind + 3 -- skip the remaining octets of the char
        xxemit ("YYY") -- 4-oct in input, not convertible
      else
        numvalsrc1 = xxlodsb ()
        if (numvalsrle==3) then
          numvalsrc2 = xxlodsb ()
        end--if
        xxemit (lfudokonv (numvalsrle,numvalsrc0,numvalsrc1,numvalsrc2,booalso6addi))
      end--if
    end--if
  end--while

  return table.concat (tabpieces)

end--function lfuremovedeko

------------------------------------------------------------------------

---- VARIABLES [R] ----

------------------------------------------------------------------------

function exporttable.ek (arxframent)

  -- Public entry point of the module.

  -- Input  : * arxframent -- frame object; its "args" table is used unless
  --                          it contains "caller" equal to the string
  --                          "true", in which case the "args" of the
  --                          parent frame are used instead
  --            * args[1] -- text to process (anything else than a string
  --                         is treated as empty)
  --            * args[2] -- the string "1" to convert the additional
  --                         chars too

  -- Output : * the processed string (can be empty)

  local arxsomons   = nil -- "args" from our own or the caller's frame
  local tabparent   = nil -- parent frame, fetched only for "caller=true"
  local strintxt    = ''  -- input string from args[1]
  local booalso7edd = false -- from args[2]

  ---- GET THE ARX (ONE OF TWO) ----

  arxsomons = arxframent.args -- "args" from our own frame
  if (type(arxsomons)~="table") then
    arxsomons = {} -- guard against indexing error from our own
  end--if
  if (arxsomons['caller']=="true") then
    -- "getParent" can deliver nothing when there is no calling frame,
    -- the result must be checked BEFORE it is indexed
    tabparent = arxframent:getParent()
    if (type(tabparent)=="table") then
      arxsomons = tabparent.args -- "args" from the caller's frame
    else
      arxsomons = nil -- no caller available, fall back to no parameters
    end--if
  end--if
  if (type(arxsomons)~="table") then
    arxsomons = {} -- guard against indexing error again
  end--if

  ---- GET THE PARAMETERS ----

  strintxt = arxsomons[1]
  if (type(strintxt)~='string') then
    strintxt = ''
  end--if
  booalso7edd = (arxsomons[2]=='1') -- "1" AKA "true" to attack additional

  ---- CARRY OUT THE HARD WORK AND RETURN THE RESULT ----

  return lfuremovedeko (strintxt, booalso7edd) -- can happen to be empty

end--function

  ---- RETURN THE EXPORT TABLE (IT HOLDS THE PUBLIC FUNCTION "ek") ----

return exporttable