local m_str_utils = require("Module:string utilities")
local export = {}
local concat = table.concat
local encode_entities = m_str_utils.encode_entities
local get_category = require("Module:maintenance category").get_category
local get_etym_lang = require("Module:etymology languages").getByCanonicalName
local insert = table.insert
local ipairs = ipairs
local list_to_set = require("Module:table").listToSet
local new_title = mw.title.new
local remove_comments = m_str_utils.remove_comments
local split = m_str_utils.split
local toNFD = mw.ustring.toNFD
local type = type
local type_or_class = require("Module:parser").type_or_class
local u = m_str_utils.char
local ugsub = mw.ustring.gsub
-- Turn a list of code points and {first, last} code point ranges into the body of a Lua character class (i.e. the
-- part that goes between the square brackets). WARNING: `ranges`, and every range table nested inside it, is
-- overwritten in place with strings, so the input cannot be reused afterwards.
local function char_ranges_to_pattern(ranges)
	for idx, entry in ipairs(ranges) do
		if type(entry) ~= "table" then
			-- A lone code point.
			ranges[idx] = u(entry)
		else
			-- A range: convert each endpoint to its character, then join the endpoints with "-".
			for pos, codepoint in ipairs(entry) do
				entry[pos] = u(codepoint)
			end
			ranges[idx] = concat(entry, "-")
		end
	end
	return concat(ranges)
end
-- Combining character data used when categorising unusual characters. These resolve into two patterns, used to find
-- single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character +
-- diacritic(s) + character).
-- Every entry is either a single code point or a two-element {first, last} table; `char_ranges_to_pattern` converts
-- the code points to characters and joins the two ends of a range with "-". Right after this table is declared, each
-- sub-table is converted into such a pattern fragment and `comb_chars` is then replaced by the finished patterns.
local comb_chars = {
-- Code points treated as single combining characters (see the description above).
single = {
{0x0300, 0x034E},
-- Exclude combining grapheme joiner.
{0x0350, 0x035B},
{0x0363, 0x036F},
{0x0483, 0x0489},
{0x0591, 0x05BD},
0x05BF,
{0x05C1, 0x05C2},
{0x05C4, 0x05C5},
0x05C7,
{0x0610, 0x061A},
{0x064B, 0x065F},
0x0670,
{0x06D6, 0x06DC},
{0x06DF, 0x06E4},
{0x06E7, 0x06E8},
{0x06EA, 0x06ED},
0x0711,
{0x0730, 0x074A},
{0x07A6, 0x07B0},
{0x07EB, 0x07F3},
0x07FD,
{0x0816, 0x0819},
{0x081B, 0x0823},
{0x0825, 0x0827},
{0x0829, 0x082D},
{0x0859, 0x085B},
{0x0898, 0x089F},
{0x08CA, 0x08E1},
{0x08E3, 0x0903},
{0x093A, 0x093C},
{0x093E, 0x094F},
{0x0951, 0x0957},
{0x0962, 0x0963},
{0x0981, 0x0983},
0x09BC,
{0x09BE, 0x09C4},
{0x09C7, 0x09C8},
{0x09CB, 0x09CD},
0x09D7,
{0x09E2, 0x09E3},
0x09FE,
{0x0A01, 0x0A03},
0x0A3C,
{0x0A3E, 0x0A42},
{0x0A47, 0x0A48},
{0x0A4B, 0x0A4D},
0x0A51,
{0x0A70, 0x0A71},
0x0A75,
{0x0A81, 0x0A83},
0x0ABC,
{0x0ABE, 0x0AC5},
{0x0AC7, 0x0AC9},
{0x0ACB, 0x0ACD},
{0x0AE2, 0x0AE3},
{0x0AFA, 0x0AFF},
{0x0B01, 0x0B03},
0x0B3C,
{0x0B3E, 0x0B44},
{0x0B47, 0x0B48},
{0x0B4B, 0x0B4D},
{0x0B55, 0x0B57},
{0x0B62, 0x0B63},
0x0B82,
{0x0BBE, 0x0BC2},
{0x0BC6, 0x0BC8},
{0x0BCA, 0x0BCD},
0x0BD7,
{0x0C00, 0x0C04},
0x0C3C,
{0x0C3E, 0x0C44},
{0x0C46, 0x0C48},
{0x0C4A, 0x0C4D},
{0x0C55, 0x0C56},
{0x0C62, 0x0C63},
{0x0C81, 0x0C83},
0x0CBC,
{0x0CBE, 0x0CC4},
{0x0CC6, 0x0CC8},
{0x0CCA, 0x0CCD},
{0x0CD5, 0x0CD6},
{0x0CE2, 0x0CE3},
0x0CF3,
{0x0D00, 0x0D03},
{0x0D3B, 0x0D3C},
{0x0D3E, 0x0D44},
{0x0D46, 0x0D48},
{0x0D4A, 0x0D4D},
0x0D57,
{0x0D62, 0x0D63},
{0x0D81, 0x0D83},
0x0DCA,
{0x0DCF, 0x0DD4},
0x0DD6,
{0x0DD8, 0x0DDF},
{0x0DF2, 0x0DF3},
0x0E31,
{0x0E34, 0x0E3A},
{0x0E47, 0x0E4E},
0x0EB1,
{0x0EB4, 0x0EBC},
{0x0EC8, 0x0ECE},
{0x0F18, 0x0F19},
0x0F35,
0x0F37,
0x0F39,
{0x0F3E, 0x0F3F},
{0x0F71, 0x0F84},
{0x0F86, 0x0F87},
{0x0F8D, 0x0F97},
{0x0F99, 0x0FBC},
0x0FC6,
{0x102B, 0x103E},
{0x1056, 0x1059},
{0x105E, 0x1060},
{0x1062, 0x1064},
{0x1067, 0x106D},
{0x1071, 0x1074},
{0x1082, 0x108D},
0x108F,
{0x109A, 0x109D},
{0x135D, 0x135F},
{0x1712, 0x1715},
{0x1732, 0x1734},
{0x1752, 0x1753},
{0x1772, 0x1773},
{0x17B4, 0x17D3},
0x17DD,
-- Exclude Mongolian variation selectors.
{0x1885, 0x1886},
0x18A9,
{0x1920, 0x192B},
{0x1930, 0x193B},
{0x1A17, 0x1A1B},
{0x1A55, 0x1A5E},
{0x1A60, 0x1A7C},
0x1A7F,
{0x1AB0, 0x1ACE},
{0x1B00, 0x1B04},
{0x1B34, 0x1B44},
{0x1B6B, 0x1B73},
{0x1B80, 0x1B82},
{0x1BA1, 0x1BAD},
{0x1BE6, 0x1BF3},
{0x1C24, 0x1C37},
{0x1CD0, 0x1CD2},
{0x1CD4, 0x1CE8},
0x1CED,
0x1CF4,
{0x1CF7, 0x1CF9},
{0x1DC0, 0x1DCC},
{0x1DCE, 0x1DFB},
{0x1DFD, 0x1DFF},
{0x20D0, 0x20F0},
{0x2CEF, 0x2CF1},
0x2D7F,
{0x2DE0, 0x2DFF},
{0x302A, 0x302F},
{0x3099, 0x309A},
{0xA66F, 0xA672},
{0xA674, 0xA67D},
{0xA69E, 0xA69F},
{0xA6F0, 0xA6F1},
0xA802,
0xA806,
0xA80B,
{0xA823, 0xA827},
0xA82C,
{0xA880, 0xA881},
{0xA8B4, 0xA8C5},
{0xA8E0, 0xA8F1},
0xA8FF,
{0xA926, 0xA92D},
{0xA947, 0xA953},
{0xA980, 0xA983},
{0xA9B3, 0xA9C0},
0xA9E5,
{0xAA29, 0xAA36},
0xAA43,
{0xAA4C, 0xAA4D},
{0xAA7B, 0xAA7D},
0xAAB0,
{0xAAB2, 0xAAB4},
{0xAAB7, 0xAAB8},
{0xAABE, 0xAABF},
0xAAC1,
{0xAAEB, 0xAAEF},
{0xAAF5, 0xAAF6},
{0xABE3, 0xABEA},
{0xABEC, 0xABED},
0xFB1E,
{0xFE20, 0xFE2F},
0x101FD,
0x102E0,
{0x10376, 0x1037A},
{0x10A01, 0x10A03},
{0x10A05, 0x10A06},
{0x10A0C, 0x10A0F},
{0x10A38, 0x10A3A},
0x10A3F,
{0x10AE5, 0x10AE6},
{0x10D24, 0x10D27},
{0x10EAB, 0x10EAC},
{0x10EFD, 0x10EFF},
{0x10F46, 0x10F50},
{0x10F82, 0x10F85},
{0x11000, 0x11002},
{0x11038, 0x11046},
0x11070,
{0x11073, 0x11074},
{0x1107F, 0x11082},
{0x110B0, 0x110BA},
0x110C2,
{0x11100, 0x11102},
{0x11127, 0x11134},
{0x11145, 0x11146},
0x11173,
{0x11180, 0x11182},
{0x111B3, 0x111C0},
{0x111C9, 0x111CC},
{0x111CE, 0x111CF},
{0x1122C, 0x11237},
0x1123E,
0x11241,
{0x112DF, 0x112EA},
{0x11300, 0x11303},
{0x1133B, 0x1133C},
{0x1133E, 0x11344},
{0x11347, 0x11348},
{0x1134B, 0x1134D},
0x11357,
{0x11362, 0x11363},
{0x11366, 0x1136C},
{0x11370, 0x11374},
{0x11435, 0x11446},
0x1145E,
{0x114B0, 0x114C3},
{0x115AF, 0x115B5},
{0x115B8, 0x115C0},
{0x115DC, 0x115DD},
{0x11630, 0x11640},
{0x116AB, 0x116B7},
{0x1171D, 0x1172B},
{0x1182C, 0x1183A},
{0x11930, 0x11935},
{0x11937, 0x11938},
{0x1193B, 0x1193E},
0x11940,
{0x11942, 0x11943},
{0x119D1, 0x119D7},
{0x119DA, 0x119E0},
0x119E4,
{0x11A01, 0x11A0A},
{0x11A33, 0x11A39},
{0x11A3B, 0x11A3E},
0x11A47,
{0x11A51, 0x11A5B},
{0x11A8A, 0x11A99},
{0x11C2F, 0x11C36},
{0x11C38, 0x11C3F},
{0x11C92, 0x11CA7},
{0x11CA9, 0x11CB6},
{0x11D31, 0x11D36},
0x11D3A,
{0x11D3C, 0x11D3D},
{0x11D3F, 0x11D45},
0x11D47,
{0x11D8A, 0x11D8E},
{0x11D90, 0x11D91},
{0x11D93, 0x11D97},
{0x11EF3, 0x11EF6},
{0x11F00, 0x11F01},
0x11F03,
{0x11F34, 0x11F3A},
{0x11F3E, 0x11F42},
0x13440,
{0x13447, 0x13455},
{0x16AF0, 0x16AF4},
{0x16B30, 0x16B36},
0x16F4F,
{0x16F51, 0x16F87},
{0x16F8F, 0x16F92},
-- Exclude Khitan Small Script filler.
{0x16FF0, 0x16FF1},
{0x1BC9D, 0x1BC9E},
{0x1CF00, 0x1CF2D},
{0x1CF30, 0x1CF46},
{0x1D165, 0x1D169},
{0x1D16D, 0x1D172},
{0x1D17B, 0x1D182},
{0x1D185, 0x1D18B},
{0x1D1AA, 0x1D1AD},
{0x1D242, 0x1D244},
{0x1DA00, 0x1DA36},
{0x1DA3B, 0x1DA6C},
0x1DA75,
0x1DA84,
{0x1DA9B, 0x1DA9F},
{0x1DAA1, 0x1DAAF},
{0x1E000, 0x1E006},
{0x1E008, 0x1E018},
{0x1E01B, 0x1E021},
{0x1E023, 0x1E024},
{0x1E026, 0x1E02A},
0x1E08F,
{0x1E130, 0x1E136},
0x1E2AE,
{0x1E2EC, 0x1E2EF},
{0x1E4EC, 0x1E4EF},
{0x1E8D0, 0x1E8D6},
{0x1E944, 0x1E94A},
},
-- Code points treated as double combining characters (see the description above). None of these code points
-- appears in `single`.
double = {
{0x035C, 0x0362},
0x1DCD,
0x1DFC,
},
vs = { -- variation selectors; separated out so that we don't get categories for them
{0xFE00, 0xFE0F},
{0xE0100, 0xE01EF},
}
}
-- Resolve every numeric character set into the body of a Lua character class (no surrounding brackets).
for key, charset in pairs(comb_chars) do
	comb_chars[key] = char_ranges_to_pattern(charset)
end
-- Replace the intermediate table with the finished patterns; only these four keys remain afterwards.
do
	local single, double, vs = comb_chars.single, comb_chars.double, comb_chars.vs
	-- Everything that may directly follow a spacing character as a single combining mark.
	local single_vs = single .. vs
	-- Every combining character of any kind.
	local all = single .. double .. vs
	comb_chars = {
		-- A non-combining character followed by one or more single combining characters, ending at the point
		-- where the combining characters stop.
		combined_single = "[^" .. all .. "][" .. single_vs .. "]+%f[^" .. all .. "]",
		-- A non-combining character, optional single combining characters, one or more double combining
		-- characters, any further combining characters, the second character, and its own single combining
		-- characters.
		combined_double = "[^" .. all .. "][" .. single_vs .. "]*[" .. double .. "]+[" .. all .. "]*.[" .. single_vs .. "]*",
		diacritics_single = "[" .. single .. "]",
		diacritics_double = "[" .. double .. "]"
	}
end
-- From https://unicode.org/Public/emoji/15.1/emoji-sequences.txt
-- Entries are single code points or {first, last} ranges. Once the table has been declared, `emoji_chars` is replaced
-- by the character-class body that `char_ranges_to_pattern` returns (no surrounding brackets); `process_page` adds
-- the brackets when it builds `emoji_pattern`.
local emoji_chars = {
{0x231A, 0x231B}, -- watch..hourglass done # E0.6 [2] (⌚..⌛)
{0x23E9, 0x23EC}, -- fast-forward button..fast down button # E0.6 [4] (⏩..⏬)
0x23F0, -- alarm clock # E0.6 [1] (⏰)
0x23F3, -- hourglass not done # E0.6 [1] (⏳)
{0x25FD, 0x25FE}, -- white medium-small square..black medium-small square # E0.6 [2] (◽..◾)
{0x2614, 0x2615}, -- umbrella with rain drops..hot beverage # E0.6 [2] (☔..☕)
{0x2648, 0x2653}, -- Aries..Pisces # E0.6 [12] (♈..♓)
0x267F, -- wheelchair symbol # E0.6 [1] (♿)
0x2693, -- anchor # E0.6 [1] (⚓)
0x26A1, -- high voltage # E0.6 [1] (⚡)
{0x26AA, 0x26AB}, -- white circle..black circle # E0.6 [2] (⚪..⚫)
{0x26BD, 0x26BE}, -- soccer ball..baseball # E0.6 [2] (⚽..⚾)
{0x26C4, 0x26C5}, -- snowman without snow..sun behind cloud # E0.6 [2] (⛄..⛅)
0x26CE, -- Ophiuchus # E0.6 [1] (⛎)
0x26D4, -- no entry # E0.6 [1] (⛔)
0x26EA, -- church # E0.6 [1] (⛪)
{0x26F2, 0x26F3}, -- fountain..flag in hole # E0.6 [2] (⛲..⛳)
0x26F5, -- sailboat # E0.6 [1] (⛵)
0x26FA, -- tent # E0.6 [1] (⛺)
0x26FD, -- fuel pump # E0.6 [1] (⛽)
0x2705, -- check mark button # E0.6 [1] (✅)
{0x270A, 0x270B}, -- raised fist..raised hand # E0.6 [2] (✊..✋)
0x2728, -- sparkles # E0.6 [1] (✨)
0x274C, -- cross mark # E0.6 [1] (❌)
0x274E, -- cross mark button # E0.6 [1] (❎)
{0x2753, 0x2755}, -- red question mark..white exclamation mark # E0.6 [3] (❓..❕)
0x2757, -- red exclamation mark # E0.6 [1] (❗)
{0x2795, 0x2797}, -- plus..divide # E0.6 [3] (➕..➗)
0x27B0, -- curly loop # E0.6 [1] (➰)
0x27BF, -- double curly loop # E1.0 [1] (➿)
{0x2B1B, 0x2B1C}, -- black large square..white large square # E0.6 [2] (⬛..⬜)
0x2B50, -- star # E0.6 [1] (⭐)
0x2B55, -- hollow red circle # E0.6 [1] (⭕)
{0x1F300, 0x1FAFF}, -- emoji in Plane 1
-- NOTE: There are lots more emoji sequences involving non-emoji Plane 0 symbols followed by 0xFE0F, which we don't
-- (yet?) handle.
}
-- Convert the numeric list into a pattern fragment; the list itself is consumed by the conversion.
emoji_chars = char_ranges_to_pattern(emoji_chars)
-- Both lookup tables from [[Module:links/data]] are inverted here, so that each original value becomes a key and
-- each original key becomes the corresponding value.
-- `unsupported_characters` is used in `process_page` as a replacement table for backtick-delimited placeholders.
-- `unsupported_titles` ends up with pagenames as keys and canonical titles as values.
local unsupported_characters, unsupported_titles = {}, {}
do
	local links_data = require("Module:links/data")
	for original_key, original_value in pairs(links_data.unsupported_characters) do
		unsupported_characters[original_value] = original_key
	end
	for original_key, original_value in pairs(links_data.unsupported_titles) do
		unsupported_titles[original_value] = original_key
	end
end
--[==[
Given a pagename (or {nil} for the current page), create and return a data structure describing the page. The returned
object includes the following fields:
* `comb_chars`: A table containing various Lua character class patterns for different types of combined characters
(those that decompose into multiple characters in the NFD decomposition). The patterns are meant to be used with
{mw.ustring.find()}. The keys are:
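** `diacritics_single`: Character class (with surrounding brackets) matching any single combining character;
** `diacritics_double`: Character class (with surrounding brackets) matching any double combining character;
** NOTE: The following four values are only intermediate pattern fragments used while this module builds the patterns;
   they are not present in the table that is actually returned:
*** `single`: Single combining characters (character + diacritic), without surrounding brackets;
*** `double`: Double combining characters (character + diacritic + character), without surrounding brackets;
*** `vs`: Variation selectors, without surrounding brackets;
*** `both`: Concatenation of `single` + `double` + `vs` (FIXME: should be named `all`), without surrounding brackets;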
** `combined_single`: Lua pattern for matching a spacing character followed by one or more single combining characters;
** `combined_double`: Lua pattern for matching a combination of two spacing characters separated by one or more double
combining characters, possibly also with single combining characters;
* `emoji_pattern`: A Lua character class pattern (including surrounding brackets) that matches emojis. Meant to be used
with {mw.ustring.find()}.
* `unsupported_titles`: Map from pagenames to canonical titles for unsupported-title pages.
* `namespace`: Namespace of the pagename.
* `full_raw_pagename`: Full version of the '''RAW''' pagename (i.e. unsupported-title pages aren't canonicalized);
including the namespace and the root (portion before the slash).
* `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized).
* `decompose_pagename`: Equivalent of `pagename` in NFD decomposition.
* `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic
are treated as single characters.
* `explode_pagename`: Set of characters found in `pagename`. The keys are characters (where combinations of spacing
character + decomposed diacritic are treated as single characters).
* `encoded_pagename`: FIXME: Document me.
* `pagename_defaultsort`: FIXME: Document me.
* `raw_defaultsort`: FIXME: Document me.
* `page_L2s`: Lookup table of L2 headings on the page, where the key is the section number assigned by the preprocessor, and the value is the L2 heading name. Once an invocation has got its actual section number from get_current_section in [[Module:utilities]], it can use this table to determine its parent L2. TODO: We could expand this to include subsections, to check POS headings are correct etc.
* `wikitext_topic_cat`: FIXME: Document me.
* `wikitext_langname_cat`: FIXME: Document me.
]==]
function export.process_page(pagename)
	-- Collect everything known about a page into a fresh table and return it. `pagename` is optional: when it is nil
	-- the current title is used. Raises an error if a supplied `pagename` cannot be turned into a title object.
	local data = {}
	data.comb_chars = comb_chars
	data.emoji_pattern = "[" .. emoji_chars .. "]"
	data.unsupported_titles = unsupported_titles
	-- Categories gathered while the page is examined.
	data.cats = {}
	-- We cannot store `raw_title` in `data` because it contains a metatable.
	local raw_title
	local function bad_pagename()
		if not pagename then
			error("Internal error: Something wrong, `data.pagename` not specified but current title containg illegal characters")
		else
			error(("Bad value for `data.pagename`: '%s', which must not contain illegal characters"):format(pagename))
		end
	end
	if pagename then -- for testing, doc pages, etc.
		raw_title = new_title(pagename)
		if not raw_title then
			bad_pagename()
		end
	else
		raw_title = mw.title.getCurrentTitle()
	end
	data.namespace = raw_title.nsText
	data.full_raw_pagename = raw_title.fullText
	local frame = mw.getCurrentFrame()
	-- WARNING: `content` May be nil, e.g. if we're substing a template like {{ja-new}} on a not-yet-created page
	-- or if the module specifies the subpage as `data.pagename` (which many modules do) and we're in an Appendix
	-- or other non-mainspace page. We used to make the latter an error but there are too many modules that do it,
	-- and substing on a nonexistent page is totally legit, and we don't actually need to be able to access the
	-- content of the page.
	local content = raw_title:getContent()
	local content_lang = mw.getContentLanguage()
	-- Get the pagename. Unsupported-title pages are mapped back to their canonical title, either through the lookup
	-- table or by replacing each backtick-delimited placeholder with the character it stands for.
	pagename = raw_title.subpageText
		:gsub("^不支援的頁面名稱/(.*)", function(m)
			insert(data.cats, "不支援的頁面名稱")
			return unsupported_titles[m] or (m:gsub("`.-`", unsupported_characters))
		end)
	-- Save pagename, as local variable will be destructively modified.
	data.pagename = pagename
	-- Decompose the pagename in Unicode normalization form D.
	data.decompose_pagename = toNFD(pagename)
	-- Explode the current page name into a character table, taking decomposed combining characters into account.
	local explode_pagename = {}
	local pagename_len = 0
	-- Used as a gsub replacement function: records the match, counts it as one character and removes it from the
	-- string, so that later passes only see what has not been consumed yet.
	local function explode(char)
		explode_pagename[char] = true
		pagename_len = pagename_len + 1
		return ""
	end
	-- Order matters: double combinations first, then single combinations, then whatever characters are left.
	pagename = ugsub(pagename, comb_chars.combined_double, explode)
	pagename = ugsub(pagename, comb_chars.combined_single, explode)
		:gsub(".[\128-\191]*", explode)
	data.explode_pagename = explode_pagename
	data.pagename_len = pagename_len
	-- Generate DEFAULTSORT.
	data.encoded_pagename = encode_entities(data.pagename)
	data.pagename_defaultsort = require("Module:languages").getByCode("mul"):makeSortKey(data.encoded_pagename)
	frame:callParserFunction(
		"DEFAULTSORT",
		data.pagename_defaultsort
	)
	data.raw_defaultsort = raw_title.text:uupper()
	-- Get section numbers for the page, and note raw wikitext use of {{DEFAULTSORT:}} and {{DISPLAYTITLE:}}.
	-- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result.
	do
		local page_L2s, defaultsort, displaytitle, page_has_L1 = {}
		local function handle_heading(node)
			local level = node.level
			if level > 2 then
				return
			end
			local name = node:get_name()
			-- Check there are no newline characters in the heading, which might appear after preprocessing (e.g.
			-- from an expanded template). In such cases, the preprocessor section count still increments (since
			-- it's calculated pre-expansion), but the heading will fail, so we shouldn't increment the L2 count.
			if name:find("\n", 1, true) then
				return
			end
			page_L2s[node.section] = name
			-- We also add any L1s, since they terminate the preceding L2, but add a maintenance category since
			-- it's probably a mistake.
			if level == 1 and not page_has_L1 then
				page_has_L1 = get_category("Pages with unwanted L1 headings")
			end
		end
		-- Each conflict category is looked up at most once, the first time the template is seen.
		local function handle_template(node)
			local name = node:get_name()
			if name == "DEFAULTSORT" and not defaultsort then
				defaultsort = get_category("Pages with DEFAULTSORT conflicts")
			elseif name == "DISPLAYTITLE" and not displaytitle then
				displaytitle = get_category("Pages with DISPLAYTITLE conflicts")
			end
		end
		if content then
			for node in require("Module:template parser").parse(content):__pairs("next_node") do
				local node_type = type_or_class(node)
				if node_type == "heading" then
					handle_heading(node)
				-- Templates no longer need inspecting once both conflicts have been found.
				elseif node_type == "template" and not (defaultsort and displaytitle) then
					handle_template(node)
				end
			end
		end
		data.page_L2s = page_L2s
		-- Any of these may still be nil, in which case nothing is added.
		insert(data.cats, defaultsort)
		insert(data.cats, displaytitle)
		insert(data.cats, page_has_L1)
	end
	------ 4. Parse page for maintenance categories. ------
	-- Use of tab characters.
	if content and content:find("\t") then
		insert(data.cats, get_category("Pages with tab characters"))
	end
	-- Unencoded character(s) in title.
	-- Ideographic description characters. The last five are written as byte escapes: U+2FFC, U+2FFD, U+2FFE, U+2FFF
	-- and U+31EF. An empty string could never match here, because `explode` only records non-empty matches.
	local IDS = list_to_set{"⿰", "⿱", "⿲", "⿳", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻", "\226\191\188", "\226\191\189", "\226\191\190", "\226\191\191", "\227\135\175"}
	for char in pairs(explode_pagename) do
		-- A page whose whole title is the description character itself is not flagged.
		if IDS[char] and char ~= data.pagename then
			insert(data.cats, "Terms containing unencoded characters")
			break
		end
	end
	-- Raw wikitext use of a topic or langname category. Also check if any raw sortkeys have been used.
	do
		local wikitext_topic_cat = {}
		local wikitext_langname_cat = {}
		local raw_sortkey
		local langnames = mw.loadData("Module:languages/canonical names")
		local etym_langnames = mw.loadData("Module:etymology languages/canonical names")
		-- If a raw sortkey has been found, add it to the relevant table.
		-- If there's no table (or the index is just `true`), create one first.
		-- Always returns true, which callers use as their "match found" signal.
		local function add_cat_table(marker, sortkey, tbl)
			if not sortkey then
				tbl[marker] = tbl[marker] or true
				return true
			elseif type(tbl[marker]) ~= "table" then
				tbl[marker] = {}
			end
			insert(tbl[marker], sortkey)
			return true
		end
		-- Try `name` as a language name, then as an etymology-language name (also with its first letter
		-- lowercased). Returns true if it was recorded, nil otherwise.
		local function do_iteration(name, sortkey, wikitext_langname_cat)
			if langnames[name] then
				return add_cat_table(name, sortkey, wikitext_langname_cat)
			end
			name = etym_langnames[name] and name or content_lang:lcfirst(name)
			if etym_langnames[name] then
				name = get_etym_lang(name):getFullName()
				return add_cat_table(name, sortkey, wikitext_langname_cat)
			end
		end
		-- `cat` is the raw text between "[[" and "]]"; `pipe` is the position of the first "|" in it (or nil).
		local function process_category(cat, pipe)
			if pipe == #cat then -- categories cannot end "|]]"
				return
			end
			local title = new_title(pipe and cat:sub(1, pipe - 1) or cat)
			if not (title and title.namespace == 14) then
				return
			end
			-- Get the sortkey (if any) from the raw link text, and only then canonicalize the category title.
			-- `pipe` is an offset into the raw text, so it must not be applied after `cat` has been overwritten.
			local sortkey = pipe and cat:sub(pipe + 1) or nil
			cat = title.text
			local code = cat:match("^([%w%-.]+):")
			if sortkey and not raw_sortkey then
				raw_sortkey = get_category("Pages with raw sortkeys")
			end
			if code then
				return add_cat_table(code, sortkey, wikitext_topic_cat)
			end
			-- Split by word.
			cat = split(cat, " ", true)
			-- Iterate over the category name, starting with the longest possible name and shaving off the first
			-- word until we find one. We do it this way because:
			-- (a) Going from shortest to longest risks falsely matching (e.g.) German Low German categories as
			--     German.
			-- (b) Checking the start of category names first risks falsely match (e.g.) Alsatian French as
			--     Alsatian (a variety of Alemannic German), not French.
			-- If no matches are found, then check the start of the category name, shaving off the last word each
			-- iteration.
			local cat_len, n, name, done = #cat, 1
			repeat
				name = concat(cat, " ", n, cat_len)
				done = do_iteration(name, sortkey, wikitext_langname_cat)
				if done then
					return
				end
				n = n + 1
			until n > cat_len
			-- The full name was already tried above, so the second pass starts one word shorter.
			n = cat_len - 1
			if n <= 0 then
				return
			end
			repeat
				name = concat(cat, " ", 1, n)
				done = do_iteration(name, sortkey, wikitext_langname_cat)
				if done then
					return
				end
				n = n - 1
			until n == 0
		end
		if content then
			-- "[[" and "]]" are replaced by single control bytes so that they can be used in character classes.
			for cat in remove_comments(content)
				:gsub("%[%[", "\1")
				:gsub("]]", "\2")
				:gmatch("\1([^\1\2]-[Cc][Aa][Tt][^\1\2]-:[^\1]-)\2") do
				process_category(cat, cat:find("|", 1, true))
			end
		end
		data.wikitext_topic_cat = wikitext_topic_cat
		data.wikitext_langname_cat = wikitext_langname_cat
		insert(data.cats, raw_sortkey)
	end
	return data
end
return export