モジュール:サンドボックス/likibp/GraphemeSplitter

提供:Wikisource
モジュールの解説[作成]
---@diagnostic disable: deprecated
--[[
GraphemeSplitter is a module designed for the Scribunto extension to MediaWiki.
It provides an implementation of the Unicode grapheme cluster splitting algorithm in Lua.
A grapheme cluster represents an individual visual character in Unicode.

This module is a port of the original source found at:
https://github.com/ufcpp/GraphemeSplitter

The source code is licensed under the MIT license:
https://opensource.org/licenses/MIT

Note: This source code, when used on or obtained from a wiki site operated by the Wikimedia Foundation,
is additionally licensed under the Creative Commons Attribution-ShareAlike License:
https://creativecommons.org/licenses/by-sa/4.0/deed
--]]


require('strict')
local checkType = require('libraryUtil').checkType
--local character = require("Module:GraphemeSplitter/GetGraphemeBreakProperty")
local character = require("モジュール:サンドボックス/likibp/GraphemeSplitter/GetGraphemeBreakProperty")

---Class to split a string into its grapheme clusters.
---Instances are created with `StringSplitter.new` and stepped with `moveNext`.
---@class StringSplitter
---@field _str string The input string being processed for grapheme cluster splitting.
---@field _index number Position (in code points, starting at 1) of the current grapheme cluster within `_str`.
---@field _count number Length (in code points) of the current grapheme cluster starting at `_index`; 0 before the first `moveNext`.
local StringSplitter = {}

---Creates a StringSplitter for the given string.
---The argument is validated with `checkType`; the new instance starts at index 1
---with a cluster length of 0, i.e. positioned before the first grapheme cluster.
---@param inputString string The input string to be split into grapheme clusters.
---@return StringSplitter # Instance of the StringSplitter class
function StringSplitter.new(inputString)
    checkType('new', 1, inputString, 'string')
    local instance = {
        _str = inputString,
        _index = 1,
        _count = 0,
    }
    -- Method lookups on the instance fall through to the class table.
    local metatable = { __index = StringSplitter }
    return setmetatable(instance, metatable)
end

---Accessor for the field `_str`.
---@return string # The input string this splitter was created with.
function StringSplitter:getStr()
    local str = self._str
    return str
end

---Accessor for the field `_index`.
---@return number # The current index position within the string being processed.
function StringSplitter:getIndex()
    local index = self._index
    return index
end

---Writes the current state of the StringSplitter to the debug log via `mw.log`.
---Logged values, in order: the substring starting at position `_index + 1`,
---the entire string, the current index, the length of the current grapheme
---cluster, and the total string length in code points.
function StringSplitter:dump()
    local str = self._str
    local index = self._index
    local tail = mw.ustring.sub(str, index + 1)
    local total = mw.ustring.len(str)
    mw.log(tail, str, index, self._count, total)
end

---Advances the splitter to the next grapheme cluster.
---The index first skips over the cluster found by the previous call (`_count` code points).
---If the resulting index lies past the last code point of the string there is nothing
---left to visit and `false` is returned; otherwise the length of the cluster starting
---at the new index is stored in `_count` and `true` is returned.
---Indices are 1-based code point positions, as used by `mw.ustring`.
---@return boolean # `true` if successfully moved to the next cluster, otherwise `false`.
function StringSplitter:moveNext()
    self._index = self._index + self._count
    -- With 1-based indexing an index equal to the length still addresses the last
    -- code point, so only an index strictly greater than the length is out of range.
    if self._index > mw.ustring.len(self._str) then
        return false
    end
    self._count = self:nextBreak(self._index)
    return true
end

---Determines the length from a given index to the boundary of the next grapheme cluster.
---Walks forward one code point at a time, asking `shouldBreak` whether a boundary lies
---between each adjacent pair. Returns as soon as a boundary is found, or when the end
---of the string is reached.
---@param startIndex number Starting point (1-based code point index) for finding the next grapheme cluster boundary.
---@return number # Length (in code points) from the specified index to the next grapheme cluster boundary.
function StringSplitter:nextBreak(startIndex)
    -- The length does not change during the scan; compute it once.
    local length = mw.ustring.len(self._str)
    local currentCodePoint = self:codePointAt(startIndex)
    local clusterLength = 1
    -- `startIndex + clusterLength` is the 1-based index of the next code point,
    -- which is valid up to and including `length`.
    while startIndex + clusterLength <= length do
        local nextCodePoint = self:codePointAt(startIndex + clusterLength)
        if self:shouldBreak(currentCodePoint, nextCodePoint) then
            return clusterLength
        end
        clusterLength    = clusterLength + 1
        currentCodePoint = nextCodePoint
    end
    return clusterLength
end

---Looks up the Unicode code point at the given position of the input string.
---The position is passed straight to `mw.ustring.codepoint`, so it is a
---code point index rather than a byte offset.
---@param targetIndex number Index to fetch the code point from.
---@return number codePoint Unicode code point at the specified index.
function StringSplitter:codePointAt(targetIndex)
    -- Parentheses keep the result to exactly one value.
    return (mw.ustring.codepoint(self._str, targetIndex))
end

--[[
    Note:
     The shouldBreak method basically implements the rules of http://unicode.org/reports/tr29/,
     but relaxes the GB10, GB12, and GB13 rules for simplicity.

     original:
     GB10 (E_Base | EBG) Extend* × E_Modifier
     GB12 sot (RI RI)* RI × RI
     GB13 [^RI] (RI RI)* RI × RI

     implemented:
     GB10 (E_Base | EBG) × Extend
     GB10 (E_Base | EBG | Extend) × E_Modifier
     GB12/GB13 RI × RI

     e.g.
     sequence | original | implemented
     --- | --- | ---
     '👩' '🏻' ZWJ '👩' | × × ×    | × × ×
     'a' '🏻' ZWJ '👩'  | ÷ ÷ ×    | ÷ × ×
     🇯🇵🇺🇸 | × ÷ × | × × ×
--]]


---Determines whether a grapheme cluster boundary exists between two consecutive characters specified by a codepoint.
---Both code points are mapped to their Grapheme_Cluster_Break property via
---`character.getGraphemeBreakProperty`, and the pair is then tested against the
---boundary rules in order; the first matching rule decides the result.
---If no rule matches, a boundary is reported.
---@param currCodePoint number The code point of the current character.
---@param nextCodePoint number The code point of the next character.
---@return boolean # `true` if a boundary exists between the two consecutive characters, otherwise `false`.
function StringSplitter:shouldBreak(currCodePoint, nextCodePoint)
    local currProp = character.getGraphemeBreakProperty(currCodePoint)
    local nextProp = character.getGraphemeBreakProperty(nextCodePoint)

    -- Do not break between a CR and LF. Otherwise, break before and after controls.
    -- GB3 CR × LF
    -- GB4 (Control | CR | LF) ÷
    -- GB5  ÷ (Control | CR | LF)
    if (currProp == "CR" and nextProp == "LF") then
        return false
    end
    if (currProp == "Control" or currProp == "CR" or currProp == "LF") then
        return true
    end
    if (nextProp == "Control" or nextProp == "CR" or nextProp == "LF") then
        return true
    end

    -- Do not break Hangul syllable sequences.
    -- GB6 L × (L | V | LV | LVT)
    -- GB7 (LV | V) × (V | T)
    -- GB8 (LVT | T) × T
    if (currProp == "L"
            and (nextProp == "L" or nextProp == "V" or nextProp == "LV" or nextProp == "LVT")) then
        return false
    end
    if ((currProp == "LV" or currProp == "V")
            and (nextProp == "V" or nextProp == "T")) then
        return false
    end
    -- GB8 keeps a trailing consonant attached to a preceding LVT syllable or
    -- to another trailing consonant (T), not to a vowel (that case is GB7).
    if ((currProp == "LVT" or currProp == "T")
            and (nextProp == "T")) then
        return false
    end

    -- Do not break before extending characters or ZWJ.
    -- GB9   × (Extend | ZWJ)
    if (nextProp == "Extend" or nextProp == "ZWJ") then
        return false
    end

    -- Do not break before SpacingMarks, or after Prepend characters.
    -- GB9a   × SpacingMark
    -- GB9b Prepend ×
    if (nextProp == "SpacingMark") then
        return false
    end
    if (currProp == "Prepend") then
        return false
    end

    -- Do not break within emoji modifier sequences or emoji zwj sequences.
    -- GB10 (E_Base | EBG) × Extend       -- already handled by GB9 above, no separate check needed
    -- GB10 (E_Base | EBG | Extend) × E_Modifier
    -- GB11 ZWJ × (Glue_After_Zwj | EBG)
    if ((currProp == "E_Base" or currProp == "E_Base_GAZ" or currProp == "Extend")
            and nextProp == "E_Modifier") then
        return false
    end
    if (currProp == "ZWJ"
            and (nextProp == "Glue_After_Zwj" or nextProp == "E_Base_GAZ")) then
        return false
    end

    -- Do not break within emoji flag sequences.
    -- GB12/GB13 RI × RI (simplified: any run of regional indicators stays together)
    if (currProp == "Regional_Indicator" and nextProp == "Regional_Indicator") then
        return false
    end

    -- Otherwise, break everywhere.
    return true
end

---Rewinds the StringSplitter to the state it had right after construction.
---The index goes back to 1 and the cluster length to 0, so the next call to
---`moveNext` starts again from the beginning of the string.
function StringSplitter:reset()
    self._index, self._count = 1, 0
end
--[[
local exports = {}
---Splits the given string into its individual grapheme clusters.
---This function uses the StringSplitter class to iterate over the string and
---extract each grapheme cluster, which may consist of one or more characters
---@param inputString string The string to be split into grapheme clusters.
---@return string[] # An array of strings, each representing a single grapheme cluster.
exports.split = function(inputString)
    local splitter = StringSplitter.new(inputString)
    local counter = 0
    ---@type string[]
    local clusterArray = {}
    if (splitter:moveNext()) then
        local prevIndex = splitter:getIndex()
        while (splitter:moveNext()) do
        	mw.log(prevIndex,splitter:getIndex())
            local clusterItem = mw.ustring.sub(splitter:getStr(), prevIndex, splitter:getIndex())
            prevIndex = splitter:getIndex()
            table.insert(clusterArray, clusterItem)
            counter = counter + 1
        end
        counter = counter + 1
        local clusterItem = mw.ustring.sub(splitter:getStr(), prevIndex, splitter:getIndex())
        table.insert(clusterArray, clusterItem)
    end
    return clusterArray
end

return exports
--]]
-- The module's value is the StringSplitter class table itself.
return StringSplitter