モジュール:サンドボックス/likibp/GraphemeSplitter
表示
モジュールの解説[作成]
---@diagnostic disable: deprecated
--[[
GraphemeSplitter is a module designed for the Scribunto extension to MediaWiki.
It provides an implementation of the Unicode grapheme cluster splitting algorithm in Lua.
A grapheme cluster represents an individual visual character in Unicode.
This module is a port of the original source found at:
https://github.com/ufcpp/GraphemeSplitter
The source code is licensed under the MIT license:
https://opensource.org/licenses/MIT
Note: This source code, when used on or obtained from a wiki site operated by the Wikimedia Foundation,
is additionally licensed under the Creative Commons Attribution-ShareAlike License:
https://creativecommons.org/licenses/by-sa/4.0/deed
--]]
require('strict')
local checkType = require('libraryUtil').checkType
--local character = require("Module:GraphemeSplitter/GetGraphemeBreakProperty")
local character = require("モジュール:サンドボックス/likibp/GraphemeSplitter/GetGraphemeBreakProperty")
---Iterator-style class that walks a string one grapheme cluster at a time.
---Positions and lengths are obtained through `mw.ustring` (see `moveNext`,
---`nextBreak` and `codePointAt`).
---@class StringSplitter
---@field _str string The input string being processed for grapheme cluster splitting.
---@field _index number Current 1-based position within `_str`; `new` and `reset` set it to 1.
---@field _count number Length of the current grapheme cluster starting at `_index`; 0 until `moveNext` computes one.
local StringSplitter = {}
---Creates a StringSplitter positioned at the start of `inputString`.
---The index starts at 1 and the cluster length at 0, so the first call to
---`moveNext` examines the string from its first character.
---@param inputString string The input string to be split into grapheme clusters.
---@return StringSplitter # A new instance whose methods resolve through StringSplitter.
function StringSplitter.new(inputString)
    -- Validate argument 1 of 'new' via libraryUtil before storing it.
    checkType('new', 1, inputString, 'string')
    local instance = {
        _str = inputString,
        _index = 1,
        _count = 0,
    }
    -- Each instance gets its own metatable that delegates lookups to the class table.
    return setmetatable(instance, { __index = StringSplitter })
end
---Returns the field variable `_str`.
---@return string # The input string this splitter was created with.
function StringSplitter:getStr()
return self._str
end
---Returns the field variable `_index`.
---@return number # The current 1-based index position within the string being processed.
function StringSplitter:getIndex()
return self._index
end
---Logs the splitter's current state through `mw.log` for debugging.
---The logged values are, in order: the substring starting one position after
---the current index, the whole string, the current index, the current cluster
---length, and the length of the whole string.
function StringSplitter:dump()
    local str = self._str
    -- NOTE(review): the substring starts at `_index + 1`, i.e. it skips the
    -- character at the current index — confirm this is intended.
    local tail = mw.ustring.sub(str, self._index + 1)
    local totalLength = mw.ustring.len(str)
    mw.log(tail, str, self._index, self._count, totalLength)
end
---Advances the current index to the start of the next grapheme cluster.
---The index first moves past the cluster found by the previous call. If it then
---exceeds the string length there are no more clusters and `false` is returned;
---otherwise the length of the cluster starting at the new index is computed,
---stored in `_count`, and `true` is returned.
---@return boolean # `true` if successfully moved to the next cluster, otherwise `false`.
function StringSplitter:moveNext()
    self._index = self._index + self._count
    -- Indices are 1-based, so an index equal to the string length still points
    -- at the last character; only an index beyond the length is past the end.
    -- (Comparing with `>=` would silently drop a final cluster starting there,
    -- e.g. the only character of a one-character string.)
    if self._index > mw.ustring.len(self._str) then
        return false
    end
    self._count = self:nextBreak(self._index)
    return true
end
---Determines the length from a given index to the boundary of the next grapheme cluster.
---Walks forward one position at a time, testing each adjacent pair of code points
---with `shouldBreak`, and stops at the first boundary or at the end of the string.
---@param startIndex number 1-based starting point for finding the next grapheme cluster boundary.
---@return number # Length from the specified index to the next grapheme cluster boundary (at least 1).
function StringSplitter:nextBreak(startIndex)
    local currentCodePoint = self:codePointAt(startIndex)
    -- The string does not change while scanning, so measure it once.
    local strLength = mw.ustring.len(self._str)
    local clusterLength = 1
    -- Indices are 1-based: the last character sits at index `strLength` and must
    -- still be examined, otherwise a trailing combining mark or modifier would
    -- never be attached to the cluster it belongs to.
    while startIndex + clusterLength <= strLength do
        local nextCodePoint = self:codePointAt(startIndex + clusterLength)
        if self:shouldBreak(currentCodePoint, nextCodePoint) then
            return clusterLength
        end
        clusterLength = clusterLength + 1
        currentCodePoint = nextCodePoint
    end
    -- Reached the end of the string without finding a boundary.
    return clusterLength
end
---Retrieves the Unicode code point at the specified index.
---Delegates to `mw.ustring.codepoint` on the stored string and returns a single value.
---@param targetIndex number Index to fetch the code point from.
---@return number codePoint Unicode code point at the specified index.
function StringSplitter:codePointAt(targetIndex)
    -- The parentheses truncate the call's results to exactly one value.
    return (mw.ustring.codepoint(self._str, targetIndex))
end
--[[
Note:
The shouldBreak method basically implements http://unicode.org/reports/tr29/
but relaxes the GB10, GB12, and GB13 rules for simplicity.
original:
GB10 (E_Base | EBG) Extend* × E_Modifier
GB12 sot (RI RI)* RI × RI
GB13 [^RI] (RI RI)* RI × RI
implemented:
GB10 (E_Base | EBG) × Extend
GB10 (E_Base | EBG | Extend) × E_Modifier
GB12/GB13 RI × RI
e.g.
sequence | original | implemented
--- | --- | ---
'👩' '🏻' ZWJ '👩' | × × × | × × ×
'a' '🏻' ZWJ '👩' | ÷ ÷ × | ÷ × ×
🇯🇵🇺🇸 | × ÷ × | × × ×
--]]
---Determines whether a grapheme cluster boundary exists between two consecutive characters specified by a codepoint.
---Looks up the grapheme break property of each code point and applies the
---boundary rules below in order; the first matching rule decides the result,
---and a boundary is reported when no rule matches.
---@param currCodePoint number The code point of the current character.
---@param nextCodePoint number The code point of the next character.
---@return boolean # `true` if a boundary exists between the two consecutive characters, otherwise `false`.
function StringSplitter:shouldBreak(currCodePoint, nextCodePoint)
    local currProp = character.getGraphemeBreakProperty(currCodePoint)
    local nextProp = character.getGraphemeBreakProperty(nextCodePoint)

    -- Do not break between a CR and LF. Otherwise, break before and after controls.
    -- GB3 CR × LF
    -- GB4 (Control | CR | LF) ÷
    -- GB5 ÷ (Control | CR | LF)
    if currProp == "CR" and nextProp == "LF" then
        return false
    end
    if currProp == "Control" or currProp == "CR" or currProp == "LF" then
        return true
    end
    if nextProp == "Control" or nextProp == "CR" or nextProp == "LF" then
        return true
    end

    -- Do not break Hangul syllable sequences.
    -- GB6 L × (L | V | LV | LVT)
    -- GB7 (LV | V) × (V | T)
    -- GB8 (LVT | T) × T
    if currProp == "L"
        and (nextProp == "L" or nextProp == "V" or nextProp == "LV" or nextProp == "LVT") then
        return false
    end
    if (currProp == "LV" or currProp == "V")
        and (nextProp == "V" or nextProp == "T") then
        return false
    end
    -- GB8 requires the left side to be LVT or T. (V × T is already handled by
    -- GB7; testing "V" here instead of "T" would wrongly split T × T.)
    if (currProp == "LVT" or currProp == "T")
        and nextProp == "T" then
        return false
    end

    -- Do not break before extending characters or ZWJ.
    -- GB9 × (Extend | ZWJ)
    if nextProp == "Extend" or nextProp == "ZWJ" then
        return false
    end

    -- Do not break before SpacingMarks, or after Prepend characters.
    -- GB9a × SpacingMark
    -- GB9b Prepend ×
    if nextProp == "SpacingMark" then
        return false
    end
    if currProp == "Prepend" then
        return false
    end

    -- Do not break within emoji modifier sequences or emoji zwj sequences.
    -- GB10 (E_Base | EBG) × Extend             -- already covered by GB9 above
    -- GB10 (E_Base | EBG | Extend) × E_Modifier
    -- GB11 ZWJ × (Glue_After_Zwj | EBG)
    if (currProp == "E_Base" or currProp == "E_Base_GAZ" or currProp == "Extend")
        and nextProp == "E_Modifier" then
        return false
    end
    if currProp == "ZWJ"
        and (nextProp == "Glue_After_Zwj" or nextProp == "E_Base_GAZ") then
        return false
    end

    -- Do not break within emoji flag sequences.
    -- GB12/GB13 RI × RI (simplified: any two adjacent regional indicators stay together)
    if currProp == "Regional_Indicator" and nextProp == "Regional_Indicator" then
        return false
    end

    -- Otherwise, break everywhere.
    return true
end
---Rewinds the StringSplitter to the state it had right after construction.
---The index goes back to 1 and the cluster length to 0, so the string's
---grapheme clusters can be iterated again from the beginning.
function StringSplitter:reset()
    self._index, self._count = 1, 0
end
--[[
local exports = {}
---Splits the given string into its individual grapheme clusters.
---This function uses the StringSplitter class to iterate over the string and
---extract each grapheme cluster, which may consist of one or more characters
---@param inputString string The string to be split into grapheme clusters.
---@return string[] # An array of strings, each representing a single grapheme cluster.
exports.split = function(inputString)
local splitter = StringSplitter.new(inputString)
local counter = 0
---@type string[]
local clusterArray = {}
if (splitter:moveNext()) then
local prevIndex = splitter:getIndex()
while (splitter:moveNext()) do
mw.log(prevIndex,splitter:getIndex())
local clusterItem = mw.ustring.sub(splitter:getStr(), prevIndex, splitter:getIndex())
prevIndex = splitter:getIndex()
table.insert(clusterArray, clusterItem)
counter = counter + 1
end
counter = counter + 1
local clusterItem = mw.ustring.sub(splitter:getStr(), prevIndex, splitter:getIndex())
table.insert(clusterArray, clusterItem)
end
return clusterArray
end
return exports
--]]
-- Export the StringSplitter class table as this module's value.
return StringSplitter