Modulo:undecorate

El Vikivortaro
Salti al navigilo Salti al serĉilo
 MODULO
Memtesto ne disponeblas.
  • prilaboras signoĉenon konvertante certajn latinajn signojn kun ĉapeloj al krudaj latinaj literoj "A"..."Z" kaj "a"..."z"
  • uzata en {{sendekoraciigo}}, {{la-ligilo}}

--[===[

MODULE "UNDECORATE" (undecorate)

"eo.wiktionary.org/wiki/Modulo:undecorate" <!--2023-Jan-19-->

Purpose: processes a string, converting certain decorated Latin
         letters to raw Latin ASCII letters "A"..."Z" and "a"..."z"

Utilo: prilaboras signocxenon konvertante certajn dekoracihavajn latinajn
       literojn al krudaj latinaj askiaj literoj "A"..."Z" kaj "a"..."z"

Manfaat: mengonversi sebuah string ...

Syfte: bearbetar en straeng genom att konvertera vissa dekorerade latinska
       bokstaever till raaa latinska ASCII bokstaever "A"..."Z" och "a"..."z"

Used by templates / Uzata far sxablonoj:
- "SXablono:sendekoraciigo" <- "SXablono:deveno3" "SXablono:t"
                               "SXablono:alilivivo"
- "SXablono:la-ligilo"

Required submodules / Bezonataj submoduloj / Submodul yang diperlukan: none

This module can accept parameters whether sent to itself (own frame) or
to the caller (caller's frame). If there is a parameter "caller=true"
on the own frame then that own frame is discarded in favor of the
caller's one.

Parameters: * 1 anonymous and obligatory parameter
              * UTF8 text (empty legal)
            * 1 anonymous and optional parameter
              * "1" to include the optional chars (by default
                they are kept unchanged)

Returned: * ASCII text (empty can occur)

This module is unbreakable (when called with correct module name
and function name).

Cxi tiu modulo estas nerompebla (kiam vokita kun gxustaj nomo de modulo
kaj nomo de funkcio).

Special diagnostic strings in output:
* "XXX" -- truncated or invalid UTF8 stream in input
* "YYY" -- internal error, "lfdokonv" called with a char length other
           than 2 or 3 (the public function "ek" never does this)
* "ZZZ" -- internal error, broken "contabua"

Format of data inside "contabua":
* sequence of line subblocks
  * UINT8 : target ASCII code
  * n * UINT8 : sequence of source UTF8 chars; b7 ZERO in the last
                continuation octet means that the char is optional (ZERO
                shifts the range from $80...$BF down to $00...$3F; no
                conflict is possible since the length is known in advance
                from the beginning octet); the sequence of UTF8 chars ends
                when a value below $80 is encountered in beginning-octet
                position (either the target ASCII letter of the next line
                or ASCII "5" as the final termination value)
* ASCII "5" ($35) as termination value

]===]

-- Table returned at the end of this module; the public function "ek" is
-- attached to it further down.
local exporttable = {}

------------------------------------------------------------------------

---- CONSTANTS [O] ----

------------------------------------------------------------------------

  -- Conversion table searched by "lfdokonv", ZERO-based (note the
  -- explicit "[0]=").
  -- Each row begins with the target ASCII code (must be 65...122) and is
  -- followed by the source UTF8 chars that map to it. All chars stored
  -- here are 2 octets long (beginning octet 195, 196, 197 or 200).
  -- A continuation octet stored below 128 (true value minus 128) marks the
  -- char as optional, ie it is converted only if the caller asks for it.
  -- The value 53 (ASCII "5") in beginning-octet position terminates the table.
  local contabua = {
    [0]=65,195,129,195,128,195,130,195,131,195,5,195,4,196,128,196,130,
    97,195,161,195,160,195,162,195,163,195,37,195,36,196,129,196,131,
    67,195,135,196,134,196,8,
    99,195,167,196,135,196,9,
    69,196,146,196,148,195,139,195,137,195,138,
    101,196,147,196,149,195,171,195,169,195,170,
    71,196,28,
    103,196,29,
    72,196,36,
    104,196,37,
    73,196,170,196,172,195,143,195,141,
    105,196,171,196,173,195,175,195,173,
    74,196,52,
    106,196,53,
    78,195,145,
    110,195,177,
    79,197,140,197,142,195,147,195,148,195,149,195,22,197,144,
    111,197,141,197,143,195,179,195,180,195,181,195,54,197,145,
    83,197,28,
    115,197,29,
    85,197,170,197,172,195,156,195,154,195,153,197,174,197,176,
    117,197,171,197,173,195,188,195,186,195,185,197,175,197,177,
    89,197,184,200,178,
    121,195,191,200,179,
    90,197,189,
    122,197,190,
    53
    }

------------------------------------------------------------------------

---- MATH FUNCTIONS [E] ----

------------------------------------------------------------------------

-- Local function MATHDIV

local function mathdiv (xdividend, xdivisor)
  -- Floor division: quotient rounded towards minus infinity.
  -- Written out via "math.floor" because this Lua has no DIV operator.
  return math.floor (xdividend / xdivisor)
end--function mathdiv

-- Local function MATHMOD

local function mathmod (xdividendo, xdivisoro)
  -- Remainder of the division, straight from the "%" operator (the sign of
  -- the result follows the divisor). Kept as a named helper next to
  -- "mathdiv" since this Lua has no bitwise AND operator either.
  return xdividendo % xdivisoro
end--function mathmod

------------------------------------------------------------------------

-- Local function MATHBITTEST

-- Find out whether single bit selected by ZERO-based index is "1" / "true".

-- Result has type "boolean".

-- Depends on functions :
-- [E] mathdiv mathmod

local function mathbittest (numincoming, numbitindex)
  -- Shift right one bit per pass until either the wanted bit has arrived
  -- at position ZERO or the value has run out of set bits.
  while ((numbitindex~=0) and (numincoming~=0)) do
    numincoming = mathdiv(numincoming,2) -- shift right
    numbitindex = numbitindex - 1 -- one bit less to go
  end--while
  -- Whatever sits in the lowest bit now is the answer (a value that ran
  -- out early is ZERO and thus gives "false").
  return (mathmod(numincoming,2)==1)
end--function mathbittest

------------------------------------------------------------------------

---- UTF8 FUNCTIONS [U] ----

------------------------------------------------------------------------

-- Local function LFULNUTF8CHAR

-- Evaluate length of a single UTF8 char in octet:s.

-- Input  : * numbgoctet  -- beginning octet of a UTF8 char

-- Output : * numlen1234x -- number 1...4 or ZERO if invalid

-- Does NOT thoroughly check the validity, looks at 1 octet only.

local function lfulnutf8char (numbgoctet)
  -- Classify the beginning octet by range; the ranges do not overlap so
  -- the first hit is the only possible one.
  if (numbgoctet<128) then
    return 1 -- $00...$7F -- ANSI/ASCII
  elseif ((numbgoctet>=194) and (numbgoctet<=223)) then
    return 2 -- $C2 to $DF
  elseif ((numbgoctet>=224) and (numbgoctet<=239)) then
    return 3 -- $E0 to $EF
  elseif ((numbgoctet>=240) and (numbgoctet<=244)) then
    return 4 -- $F0 to $F4
  end--if
  return 0 -- everything else ($80...$C1 and $F5...$FF) is invalid here
end--function lfulnutf8char

------------------------------------------------------------------------

---- HIGH LEVEL FUNCTIONS [H] ----

------------------------------------------------------------------------

-- Local function LFHSPLITB7NOW

-- Split a UINT8 into b0...b6 (b7 always ONE) and separate b7 as boolean.

local function lfhsplitb7now (nummain)
  -- Returns two values: the octet with b7 forced to ONE, and a boolean
  -- telling whether b7 was already ONE on entry.
  local boohadb7 = mathbittest (nummain,7)
  if boohadb7 then
    return nummain,true -- b7 was set already, value passes unchanged
  end--if
  return (nummain + 128),false -- b7 was clear, set it by adding 128
end--function lfhsplitb7now

------------------------------------------------------------------------

-- Local function LFDOKONV

-- Try to convert ie undecorate one UTF8 char.

-- Input  : * numinutflen -- 2 or 3
--          * booinkall -- "true" to convert the optional excluded ones too

-- Output : * strdukonv -- 4 possibilities

-- Depends on functions :
-- [H] lfhsplitb7now
-- [U] lfulnutf8char
-- [E] mathbittest mathdiv mathmod

-- Depends on constants :
-- * table "contabua" -- length is unknown but the end is marked with value 53

-- Note that "numinutflen" can be 2 or 3 only, do NOT call this otherwise.

-- Note the inverted meaning of "b7" after it is picked (from "lfhsplitb7now"):
-- * ONE default  -- char is NOT optional
-- * ZERO special -- char is optional and excluded from conversion unless
--                   "booinkall" requests inclusion

local function lfdokonv (numinutflen,numvlsrc0,numvlsrc1,numvlsrc2,booinkall)

  -- Look up one UTF8 char (given as its 2 or 3 octets) in "contabua".
  -- Returns one of:
  -- * "YYY" -- "numinutflen" is neither 2 nor 3
  -- * "ZZZ" -- "contabua" is broken (bad target code or bad beginning octet)
  -- * the char itself, unchanged -- not listed, or listed as optional
  --   while "booinkall" is not set
  -- * the single ASCII letter that the char maps to

  -- Rebuild the incoming char from its octets for the "unchanged" case.
  local function lfkeepasis ()
    if (numinutflen==3) then
      return string.char(numvlsrc0,numvlsrc1,numvlsrc2)
    end--if
    return string.char(numvlsrc0,numvlsrc1)
  end--function lfkeepasis

  if ((numinutflen~=2) and (numinutflen~=3)) then
    return "YYY" -- refused before the table is touched at all
  end--if

  local numcursor = 0 -- ZERO-based read position in "contabua"
  local numtarget = contabua[numcursor] -- target code of the current row

  while (true) do -- one pass per row of "contabua"

    if ((numtarget<65) or (numtarget>122)) then
      return "ZZZ" -- row does not begin with a plausible ASCII code
    end--if
    numcursor = numcursor + 1

    local numlead = contabua[numcursor] -- beginning octet or next row start
    while (numlead>=128) do -- one pass per source char of this row

      local numentlen = lfulnutf8char (numlead)
      if ((numentlen~=2) and (numentlen~=3)) then
        return "ZZZ" -- beginning octet in table is not a 2 or 3 octet one
      end--if

      -- "boomandatory" comes from b7 of the LAST continuation octet:
      -- "true" means the char is always converted, "false" means optional.
      local numsecond, boomandatory = lfhsplitb7now (contabua[numcursor+1])
      local numthird = 0
      numcursor = numcursor + 2
      if (numentlen==3) then
        numthird, boomandatory = lfhsplitb7now (contabua[numcursor])
        numcursor = numcursor + 1
      end--if

      local boohit = (numentlen==numinutflen) and (numlead==numvlsrc0)
                     and (numsecond==numvlsrc1)
                     and ((numentlen~=3) or (numthird==numvlsrc2))
      if boohit then
        if (boomandatory or booinkall) then
          return string.char(numtarget) -- take the hit, replace
        end--if
        return lfkeepasis () -- optional and not requested, discard the hit
      end--if

      numlead = contabua[numcursor] -- peek at what follows this entry
    end--while -- over source chars of this row

    if (numlead==53) then
      return lfkeepasis () -- terminator reached without any match
    end--if
    numtarget = numlead -- value below 128 is the target code of next row

  end--while -- over rows

end--function lfdokonv

------------------------------------------------------------------------

---- VARIABLES [R] ----

------------------------------------------------------------------------

function exporttable.ek (arxframent)

  -- Public entry point: undecorate the text given in anonymous parameter 1.

  -- Input  : * arxframent -- frame object; the parameters are read from its
  --                          "args", or from the "args" of the parent frame
  --                          if our own "args" contain "caller=true"
  --            * [1] text to process (missing or non-string counts as empty)
  --            * [2] "1" to convert the optional chars too

  -- Output : * string, can be empty; on a truncated or invalid UTF8 char
  --            "XXX" is appended and processing stops there

  -- Since "lfdokonv" accepts chars of 2 or 3 octets only, chars of 4 octets
  -- can never be converted; they are copied to the output unchanged
  -- (formerly they aborted the conversion with "XXX").

  -- A missing frame or a missing parent frame (with "caller=true") results
  -- in empty output rather than a script error.

  -- Depends on functions :
  -- [H] lfdokonv
  -- [U] lfulnutf8char

  -- special type "args" AKA "arx"

  local arxsomons = nil -- metaized "args" from our own or caller's "frame"
  local arxparent = nil -- caller's "frame", only looked up on request

  -- general "str" and "tab"

  local strintxt      = ""  -- input string from [1]
  local tabpieces     = {}  -- output pieces, joined once at the very end

  -- general "num"

  local numsrclen     = 0
  local numsrcind     = 0   -- ZERO-based index of next octet to read
  local numvalsrc0    = 0
  local numvalsrc1    = 0
  local numvalsrc2    = 0
  local numvalsrle    = 0

  -- general "boo"

  local booalsoeks    = false -- from [2]

  ---- GET THE ARX (ONE OF TWO) ----

  if (type(arxframent)=="table") then
    arxsomons = arxframent.args -- "args" from our own "frame"
  end--if
  if (type(arxsomons)~="table") then
    arxsomons = {} -- guard against indexing error from our own
  end--if
  if (arxsomons['caller']=="true") then
    -- "getParent" can return nothing if there is no calling page or
    -- template, thus do not index its result blindly
    if (type(arxframent.getParent)=="function") then
      arxparent = arxframent:getParent()
    end--if
    if (type(arxparent)=="table") then
      arxsomons = arxparent.args -- "args" from caller's "frame"
    else
      arxsomons = nil -- no caller, fall through to the guard below
    end--if
  end--if
  if (type(arxsomons)~="table") then
    arxsomons = {} -- guard against indexing error again
  end--if

  ---- GET THE PARAMETERS ----

  strintxt = arxsomons[1]
  if (type(strintxt)~="string") then
    strintxt = ""
  end--if
  booalsoeks = (arxsomons[2]=="1") -- "1" AKA "true" to take excluded too

  ---- CARRY OUT THE HARD WORK ----

    local function xxpick () -- upvalues "strintxt" (const) and "numsrcind" (upd)
      local xx = string.byte (strintxt,(numsrcind+1))
      numsrcind = numsrcind + 1
      return xx
    end--function xxpick

    local function xxemit (strpiece) -- upvalue "tabpieces" (upd)
      tabpieces[#tabpieces+1] = strpiece
    end--function xxemit

  numsrclen = string.len (strintxt)
  numsrcind = 0 -- ZERO-based
  while (numsrcind<numsrclen) do
    numvalsrc0 = xxpick ()
    numvalsrle = lfulnutf8char (numvalsrc0)
    if (numvalsrle==1) then
      xxemit (string.char (numvalsrc0)) -- plain ASCII, nothing to look up
    elseif ((numvalsrle==0) or ((numsrcind+numvalsrle-1)>numsrclen)) then
      -- beginning octet is consumed already, so "numvalsrle-1" more octets
      -- are needed beyond "numsrcind"
      xxemit ("XXX") -- truncated or invalid UTF8 stream in input
      break
    elseif (numvalsrle==4) then
      -- "numsrcind" equals the ONE-based position of the beginning octet now
      xxemit (string.sub (strintxt,numsrcind,(numsrcind+3)))
      numsrcind = numsrcind + 3 -- skip the 3 continuation octets
    else
      numvalsrc1 = xxpick () -- length is 2 or 3 here
      if (numvalsrle==3) then
        numvalsrc2 = xxpick ()
      end--if
      xxemit (lfdokonv (numvalsrle,numvalsrc0,numvalsrc1,numvalsrc2,booalsoeks))
    end--if
  end--while

  ---- RETURN THE RESULT STRING ----

  return table.concat (tabpieces) -- can be empty

end--function

  ---- RETURN THE MODULE TABLE (holds the public function "ek") ----

return exporttable