Modulo:mundecorate

El Vikivortaro
Salti al navigilo Salti al serĉilo



  • prilaboras signoĉenon konvertante certajn latinajn signojn kun ĉapeloj al krudaj latinaj literoj "A"..."Z" kaj "a"..."z"
  • Uzata en {{sendekoraciigo}}


--[===[

MODULE "MUNDECORATE" (undecorate)

"eo.wiktionary.org/wiki/Modulo:mundecorate" <!--2021-Dec-29-->

Purpose: processes a string, converting certain decorated Latin
         letters to raw Latin ASCII letters "A"..."Z" and "a"..."z"

Utilo: prilaboras signocxenon konvertante certajn ...

Manfaat: mengonversi sebuah string ...

Syfte: bearbetar en straeng genom att konvertera vissa dekorerade latinska
       bokstaever till raaa latinska ASCII bokstaever "A"..."Z" och "a"..."z"

Used by templates / Uzata far sxablonoj:
- ...

Required submodules / Bezonataj submoduloj / Submodul yang diperlukan: none

This module can accept parameters whether sent to itself (own frame) or
to the caller (caller's frame). If there is a parameter "caller=true"
on the own frame then that own frame is discarded in favor of the
caller's one.

Parameters: * 1 anonymous obligatory parameter
              * UTF8 text (empty legal)
            * 1 anonymous optional parameter
              * "1" to include the optional chars (by default
                they are kept unchanged)

Returned: * ASCII text (empty can occur)

This module is unbreakable (when called with correct module name
and function name).

Cxi tiu modulo estas nerompebla (kiam vokita kun gxustaj nomo de modulo
kaj nomo de funkcio).

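Special diagnostic strings in output:
* "XXX" -- truncated or invalid UTF8 stream in input (4-oct chars are
           currently reported this way too, and processing stops there)
* "YYY" -- char length other than 2 or 3 octets handed over to the
           converter (meant for 4-oct input, not reached at the moment)
* "ZZZ" -- internal error, broken "contab"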

Format of data inside "contab":
* sequence of line subblocks
  * UINT8 : target ASCII code
  * n * UINT8 : sequence of source UTF8 char:s, b7 ZERO in the last
                continuation octet of a char means that the char is
                optional (ZERO shifts the range from $80...$BF down to
                $00...$3F, conflict is impossible since the length is
                known in advance from the beginning octet), the sequence
                of UTF8 char:s ends when either a valid ASCII letter (the
                target code of the next line) or ASCII "5" (the final
                termination value) is encountered
* ASCII "5" ($35) as termination value

]===]

-- Export table of this module: the public function "ek" is attached to
-- it further down and the table is returned at the very end of the file.
local undecorate = {}

------------------------------------------------------------------------

---- CONSTANTS ----

------------------------------------------------------------------------

  -- Conversion table, ZERO-based (the first target code sits at index 0).
  -- Every row below is one line subblock: the target ASCII code followed
  -- by the UTF8 octets of all source chars that map to it. A final octet
  -- below 128 is the real octet with b7 cleared (ie minus 128) and flags
  -- the char as optional (converted only when the caller asks for it).
  -- The lone 53 (ASCII "5") on the last row terminates the table.
  local contab = {
    [0]=65,195,129,195,128,195,130,195,131,195,5,195,4,196,128,196,130, -- "A"
    97,195,161,195,160,195,162,195,163,195,37,195,36,196,129,196,131, -- "a"
    67,195,135,196,134,196,8, -- "C"
    99,195,167,196,135,196,9, -- "c"
    69,196,146,196,148,195,139,195,137,195,138, -- "E"
    101,196,147,196,149,195,171,195,169,195,170, -- "e"
    71,196,28, -- "G"
    103,196,29, -- "g"
    72,196,36, -- "H"
    104,196,37, -- "h"
    73,196,170,196,172,195,143,195,141, -- "I"
    105,196,171,196,173,195,175,195,173, -- "i"
    74,196,52, -- "J"
    106,196,53, -- "j"
    78,195,145, -- "N"
    110,195,177, -- "n"
    79,197,140,197,142,195,147,195,148,195,149,195,22,197,144, -- "O"
    111,197,141,197,143,195,179,195,180,195,181,195,54,197,145, -- "o"
    83,197,28, -- "S"
    115,197,29, -- "s"
    85,197,170,197,172,195,156,195,154,195,153,197,174,197,176, -- "U"
    117,197,171,197,173,195,188,195,186,195,185,197,175,197,177, -- "u"
    89,197,184,200,178, -- "Y"
    121,195,191,200,179, -- "y"
    90,197,189, -- "Z"
    122,197,190, -- "z"
    53 -- terminator, ASCII "5"
    }

------------------------------------------------------------------------

---- LOCAL MATH FUNCTIONS ----

------------------------------------------------------------------------

-- Local function MATHDIV

local function mathdiv (xdividend, xdivisor)
  -- Floored division: the quotient rounded towards minus infinity.
  -- Spelled out via "math.floor" as no DIV operator is used here.
  return math.floor (xdividend / xdivisor)
end--function mathdiv

-- Local function MATHMOD

local function mathmod (xdividendo, xdivisoro)
  -- Remainder of the floored division, a thin wrapper around the "%"
  -- operator kept as a counterpart to "mathdiv".
  return xdividendo % xdivisoro
end--function mathmod

------------------------------------------------------------------------

-- Local function MATHBITTEST

-- Find out whether single bit selected by ZERO-based index is "1" / "true".

-- Result has type "boolean".

-- This sub depends on "MATH FUNCTIONS"\"mathdiv"
-- and "MATH FUNCTIONS"\"mathmod".

local function mathbittest (numincoming, numbitindex)
  -- Shift right by one position per round until the wanted bit has
  -- arrived at position ZERO, or until no set bits are left at all
  -- (in which case the result is inevitably "false").
  while ((numbitindex~=0) and (numincoming~=0)) do
    numincoming = mathdiv (numincoming,2) -- shift right
    numbitindex = numbitindex - 1 -- one position less to go
  end--while
  return (mathmod (numincoming,2)==1) -- lowest bit is the wanted one now
end--function mathbittest

------------------------------------------------------------------------

---- LOCAL UTF8 FUNCTIONS ----

------------------------------------------------------------------------

-- Local function LFUTF8LENGTH

-- Measure length of a single UTF8 char, return ZERO if invalid.

-- Does NOT thoroughly check the validity, looks at 1 octet only.

-- Input  : * numbgoctet  -- beginning octet of a UTF8 char

-- Output : * numlen1234x -- number 1...4 or ZERO if invalid

local function lfutf8length (numbgoctet)
  -- Classify the beginning octet of a UTF8 char by the range it falls
  -- into. Only this single octet is inspected, the octets following it
  -- are not validated here. Everything outside the 4 ranges (among
  -- others $80...$C1 and $F5...$FF) gives ZERO ie "invalid".
  if (numbgoctet<128) then
    return 1 -- $00...$7F -- ANSI/ASCII
  elseif ((numbgoctet>=194) and (numbgoctet<=223)) then
    return 2 -- $C2 to $DF
  elseif ((numbgoctet>=224) and (numbgoctet<=239)) then
    return 3 -- $E0 to $EF
  elseif ((numbgoctet>=240) and (numbgoctet<=244)) then
    return 4 -- $F0 to $F4
  end--if
  return 0 -- invalid
end--function lfutf8length

------------------------------------------------------------------------

---- LOCAL HIGH LEVEL FUNCTIONS ----

------------------------------------------------------------------------

-- Local function LFSPLITB7NOW

-- Split a UINT8 into b0...b6 (b7 always ONE) and separate b7 as boolean.

local function lfsplitb7now (nummain)
  -- Returns 2 values: the octet with b7 forced to ONE, and the state
  -- that b7 had on entry as a boolean.
  if (mathbittest (nummain,7)) then
    return nummain,true -- b7 was ONE already, nothing to repair
  end--if
  return (nummain + 128),false -- b7 was ZERO, put it back in
end--function lfsplitb7now

------------------------------------------------------------------------

-- Local function LFDOKONV

-- Try to convert ie undecorate one UTF8 char.

-- Note that "numvlsrlink" can be 2 or 3 only, do NOT call this otherwise.

-- Note the inverted meaning of "b7" coming from "lfsplitb7now". ONE is
-- default, and ZERO means "the char is optional and excluded by default".

-- We depend on global "contab". The length is unknown but the
-- end is marked with a value 53.

-- Input  : * numvlsrlink -- 2 or 3
--          * booinkall -- "true" to convert the optional excluded ones too

-- Output : * strhasil -- 4 possibilities

local function lfdokonv (numvlsrlink,numvlsrc0,numvlsrc1,numvlsrc2,booinkall)

  -- Look up one UTF8 char of 2 or 3 octets in "contab" and give back
  -- 1 of 4 possible strings:
  -- * "YYY" if "numvlsrlink" is neither 2 nor 3
  -- * "ZZZ" if "contab" turns out to be broken during the scan
  -- * the incoming char unchanged if it is not listed in "contab", or
  --   listed as optional while "booinkall" does not ask for those
  -- * the single target ASCII letter otherwise

  local numcursor = 0 -- ZERO-based read position inside "contab"
  local numtarget = 0 -- target ASCII code of the line subblock being scanned
  local numlead = 0 -- beginning octet of the listed char
  local numoct1 = 0 -- 2nd octet of the listed char, b7 restored
  local numoct2 = 0 -- 3rd octet of the listed char, b7 restored
  local numlistlen = 0 -- length of the listed char, must be 2 or 3
  local boobydefault = false -- "false" if the listed char is optional
  local boohit = false

    local function lfkeepasis () -- rebuild the incoming char from its octets
      if (numvlsrlink==3) then
        return string.char(numvlsrc0,numvlsrc1,numvlsrc2)
      end--if
      return string.char(numvlsrc0,numvlsrc1)
    end--function lfkeepasis

  if ((numvlsrlink~=2) and (numvlsrlink~=3)) then
    return "YYY" -- bad "numvlsrlink"
  end--if

  while (true) do -- one round per line subblock ie per target ASCII char

    numtarget = contab[numcursor] -- must be ASCII
    if ((numtarget<65) or (numtarget>122)) then
      return "ZZZ" -- broken data
    end--if
    numcursor = numcursor + 1

    while (true) do -- one round per listed source UTF8 char
      numlead = contab[numcursor]
      if (numlead<128) then -- target of next line subblock or terminator
        if (numlead==53) then
          return lfkeepasis () -- whole table scanned without any match
        end--if
        break -- leave the cursor alone, the outer loop picks it as target
      end--if
      numcursor = numcursor + 1 -- step only AFTER the check against ASCII
      numlistlen = lfutf8length (numlead)
      if ((numlistlen~=2) and (numlistlen~=3)) then
        return "ZZZ" -- broken data
      end--if
      numoct1,boobydefault = lfsplitb7now (contab[numcursor])
      numcursor = numcursor + 1
      if (numlistlen==3) then
        -- the flag of the last octet is the one that counts
        numoct2,boobydefault = lfsplitb7now (contab[numcursor])
        numcursor = numcursor + 1
      end--if
      boohit = (numlistlen==numvlsrlink) and (numlead==numvlsrc0) and (numoct1==numvlsrc1)
      if (boohit and (numlistlen==3)) then
        boohit = (numoct2==numvlsrc2)
      end--if
      if (boohit) then
        if (boobydefault or booinkall) then
          return string.char(numtarget) -- replace, take the hit
        end--if
        return lfkeepasis () -- optional and not requested, discard the hit
      end--if
    end--while -- per listed source UTF8 char

  end--while -- per line subblock

end--function lfdokonv

------------------------------------------------------------------------

---- MAIN EXPORTED FUNCTION ----

------------------------------------------------------------------------

function undecorate.ek (arxframent)

  -- Exported entry point: undecorate the text given as parameter.

  -- Input  : * arxframent -- frame, only "args" and "getParent" are used
  --            * args[1] -- text to process (anything but a string
  --                         counts as empty)
  --            * args[2] -- "1" to convert the optional chars too
  --            * args['caller'] -- "true" on our own frame to take
  --                                [1] and [2] from the caller's frame

  -- Output : * result string (can be empty), ASCII chars are copied,
  --            chars of 2 or 3 octets go through "lfdokonv", and
  --            anything else (invalid beginning octet, 4-oct char,
  --            input ending inside a char) appends "XXX" and stops

  -- Octets following the beginning octet are NOT checked for being
  -- valid continuation octets.

  -- special type "args" AKA "arx"

  local arxsomons = nil -- metaized "args", either own or caller's
  local frmparent = nil -- caller's "frame", fetched only if requested

  -- general "str" and "tab"

  local strintxt      = ""  -- input string from [1]
  local tabret        = {}  -- pieces of the result, joined at the very end

  -- general "num"

  local numsrclen     = 0
  local numsrcind     = 0   -- ZERO-based, number of octets consumed so far
  local numvalsrc0    = 0
  local numvalsrc1    = 0
  local numvalsrc2    = 0
  local numvalsrle    = 0

  -- general "boo"

  local booeks        = false -- from [2]

  ---- GET THE ARX (ONE OF TWO) ----

  -- every step is guarded, a missing frame or parent must not break us

  if (type(arxframent)=="table") then
    arxsomons = arxframent.args -- "args" from our own "frame"
  end--if
  if (type(arxsomons)~="table") then
    arxsomons = {} -- guard against indexing error from our own
  end--if
  if (arxsomons['caller']=="true") then
    arxsomons = nil
    if (type(arxframent.getParent)=="function") then
      frmparent = arxframent:getParent() -- can be "nil" if no caller exists
    end--if
    if (type(frmparent)=="table") then
      arxsomons = frmparent.args -- "args" from caller's "frame"
    end--if
  end--if
  if (type(arxsomons)~="table") then
    arxsomons = {} -- guard against indexing error again
  end--if

  ---- GET THE PARAMETERS ----

  strintxt = arxsomons[1]
  if (type(strintxt)~="string") then
    strintxt = ""
  end--if
  booeks = (arxsomons[2]=="1") -- bool -- "1" AKA "true" to take excluded too

  ---- CARRY OUT THE HARD WORK ----

    local function xxpick () -- upvalues "strintxt" (const) and "numsrcind" (upd)
      numsrcind = numsrcind + 1
      return string.byte (strintxt,numsrcind,numsrcind)
    end--function xxpick

  numsrclen = string.len (strintxt)
  numsrcind = 0 -- ZERO-based
  while (numsrcind<numsrclen) do
    numvalsrc0 = xxpick ()
    numvalsrle = lfutf8length (numvalsrc0)
    if (numvalsrle==1) then
      tabret[#tabret+1] = string.char (numvalsrc0) -- ASCII, copy as is
    else
      -- the beginning octet is consumed already, thus "numvalsrle-1" more
      -- octets are needed, and ZERO (invalid) and 4 are rejected here
      if (((numvalsrle~=2) and (numvalsrle~=3)) or ((numsrcind+numvalsrle-1)>numsrclen)) then
        tabret[#tabret+1] = "XXX" -- truncated or invalid UTF8 stream in input
        break -- give up on the remaining input
      end--if
      numvalsrc1 = xxpick () -- ZERO and ONE already excluded, must be 2 or 3
      if (numvalsrle==3) then
        numvalsrc2 = xxpick () -- !!!FIXME!!! 4-oct does NOT work yet
      end--if
      tabret[#tabret+1] = lfdokonv (numvalsrle,numvalsrc0,numvalsrc1,numvalsrc2,booeks)
    end--if
  end--while

  ---- RETURN THE RESULT STRING ----

  return table.concat (tabret) -- can be empty

end--function

  ---- RETURN THE EXPORT TABLE (CARRIES THE PUBLIC FUNCTION "EK") ----

return undecorate