Jump to content

Module:WikitextParser

From Wikipedia, the free encyclopedia

-- Module:WikitextParser is a general-purpose wikitext parser
-- Documentation and master version: https://en.wikipedia.org/wiki/Module:WikitextParser
-- Authors: User:Sophivorus, User:Certes, User:Aidan9382, et al.
-- License: CC-BY-SA-4.0
local WikitextParser = {}

-- Private helper method to escape a string for use in Lua patterns
-- Every magic character ( ^ $ ( ) . [ ] * + - ? % ) gets a '%' in front of it.
-- @param str Required. String to escape.
-- @return The escaped string, and only that: the substitution count that
--   gsub also returns is dropped (extra parentheses) so it can never leak
--   into a caller's argument list.
local function escapeString( str )
	return ( str:gsub( '[%^%$%(%)%.%[%]%*%+%-%?%%]', '%%%0' ) )
end

-- Get the lead section from the given wikitext
-- The lead section is any content before the first section title.
-- @param wikitext Required. Wikitext to parse.
-- @return Wikitext of the lead section. May be empty if the lead section is empty.
function WikitextParser.getLead( wikitext )
	-- The leading newline lets a section title on the very first line match '\n=='
	local lead = ( '\n' .. wikitext ):gsub( '\n==.*', '' )
	return mw.text.trim( lead )
end

-- Get the sections from the given wikitext
-- This method doesn't get the lead section, use getLead for that
-- If a title appears more than once, its entry holds the content of the first instance.
-- @param wikitext Required. Wikitext to parse.
-- @return Map from section title to section content
function WikitextParser.getSections( wikitext )
	local sections = {}
	-- Leading newline: a title on the first line can match.
	-- Trailing '\n==': sentinel so the content of the last section is terminated.
	wikitext = '\n' .. wikitext .. '\n=='
	for title in wikitext:gmatch( '\n==+ *([^=]-) *==+' ) do
		local section = wikitext:match( '\n==+ *' .. escapeString( title ) .. ' *==+(.-)\n==' )
		-- An unterminated heading on the last line can swallow the sentinel,
		-- leaving nothing to delimit its content; skip it instead of trimming nil.
		if section then
			sections[ title ] = mw.text.trim( section )
		end
	end
	return sections
end

-- Get a section from the given wikitext (including any subsections)
-- If the given section title appears more than once, only the section of the first instance will be returned
-- @param wikitext Required. Wikitext to parse.
-- @param title Required. Title of the section
-- @return Wikitext of the section, or nil if it isn't found. May be empty if the section is empty or contains only subsections.
function WikitextParser.getSection( wikitext, title )
	title = escapeString( mw.text.trim( title ) )
	wikitext = '\n' .. wikitext .. '\n'
	local level, content = wikitext:match( '\n(==+) *' .. title .. ' *==.-\n(.*)' )
	if content then
		-- Matches the next title with at most as many '=' as the requested one
		local nextSection = '\n==' .. string.rep( '=?', #level - 2 ) .. '[^=].*'
		-- The captured content starts right after the title line, so a title on
		-- its first line (empty section) has no '\n' before it; restore one so
		-- the pattern above can see it.
		content = ( '\n' .. content ):gsub( nextSection, '' ) -- remove later sections at this level or higher
		return mw.text.trim( content )
	end
end

-- Get the content of a <section> tag from the given wikitext.
-- We can't use getTags because unlike all other tags, both opening and closing <section> tags are self-closing.
-- @param wikitext Required. Wikitext to parse.
-- @param name Required. Name of the <section> tag
-- @return Content of the <section> tag, or nil if it isn't found. May be empty if the section tag is empty.
function WikitextParser.getSectionTag( wikitext, name )
	name = escapeString( mw.text.trim( name ) )
	-- Optional quotes and spaces around the name; spaces are allowed before
	-- '=' in both the begin and the end marker.
	local quotedName = ' *= *["\']? *' .. name .. ' *["\']? */>'
	local content = wikitext:match( '< *section +begin' .. quotedName .. '(.-)< *section +end' .. quotedName )
	if content then
		return mw.text.trim( content )
	end
end

-- Get the lists from the given wikitext.
-- A list is a run of consecutive lines that start with '*' or '#'.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of lists.
function WikitextParser.getLists( wikitext )
	local lists = {}
	-- Leading newline: a list on the first line can match.
	-- Trailing newlines: a list on the last line is terminated.
	wikitext = '\n' .. wikitext .. '\n\n'
	local position = 1
	while true do
		local _, matchEnd, list = wikitext:find( '\n([*#].-)\n[^*#]', position )
		if not list then
			break
		end
		lists[ #lists + 1 ] = list
		-- Resume at the newline that terminated this list rather than after it,
		-- so a list that starts right after a single blank line is still found.
		position = matchEnd - 1
	end
	return lists
end

-- Get the paragraphs from the given wikitext.
-- @param wikitext Required.
--   Wikitext to parse.
-- @return Sequence of paragraphs.
function WikitextParser.getParagraphs( wikitext )
	local paragraphs = {}

	-- Remove non-paragraphs
	-- (the surrounding newlines let the '\n'-anchored patterns match on the first and last line)
	wikitext = '\n' .. wikitext .. '\n'
	wikitext = wikitext:gsub( '\n[*#][^\n]*', '' ) -- remove lists
	wikitext = wikitext:gsub( '\n%[%b[]%]\n', '' ) -- remove files and categories
	wikitext = wikitext:gsub( '\n%b{} *\n', '\n%0\n' ) -- add spacing between tables and block templates
	wikitext = wikitext:gsub( '\n%b{} *\n', '\n' ) -- remove tables and block templates
	wikitext = wikitext:gsub( '\n==+[^=]+==+ *\n', '\n' ) -- remove section titles
	wikitext = mw.text.trim( wikitext )

	-- Whatever is left is split on blank lines; whitespace-only chunks are dropped
	for paragraph in mw.text.gsplit( wikitext, '\n\n+' ) do
		if mw.text.trim( paragraph ) ~= '' then
			paragraphs[ #paragraphs + 1 ] = paragraph
		end
	end
	return paragraphs
end

-- Get the templates from the given wikitext.
-- Parser functions like {{#if:...}} are skipped. A template nested inside
-- another one is not listed separately, it is part of the outer match.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of templates.
function WikitextParser.getTemplates( wikitext )
	local templates = {}
	for template in wikitext:gmatch( '{%b{}}' ) do
		-- Test the matched template itself, not the start of the whole wikitext
		if template:sub( 1, 3 ) ~= '{{#' then -- skip parser functions like #if
			templates[ #templates + 1 ] = template
		end
	end
	return templates
end

-- Get the requested template from the given wikitext.
-- If the template appears more than once, only the first instance will be returned
-- Names are compared after upper-casing their first letter with the content language.
-- @param wikitext Required. Wikitext to parse.
-- @param name Name of the template to get
-- @return Wikitext of the template, or nil if it wasn't found
function WikitextParser.getTemplate( wikitext, name )
	local templates = WikitextParser.getTemplates( wikitext )
	local lang = mw.language.getContentLanguage()
	-- ipairs guarantees document order, which "first instance" relies on
	for _, template in ipairs( templates ) do
		local templateName = WikitextParser.getTemplateName( template )
		-- A template without a parseable name (like {{|x}}) can never be the requested one
		if templateName and lang:ucfirst( templateName ) == lang:ucfirst( name ) then
			return template
		end
	end
end

-- Get name of the template from the given template wikitext.
-- @param templateWikitext Required.
--   Wikitext of the template to parse.
-- @return Name of the template, without surrounding whitespace, or nil if no name can be parsed
-- @todo Strip "Template:" namespace?
function WikitextParser.getTemplateName( templateWikitext )
	local name = templateWikitext:match( '^{{ *([^}|\n]+)' )
	if name then
		-- The capture runs up to the first '|', '}' or newline, so it may end
		-- with spaces (as in "{{Foo |bar}}") that are not part of the name.
		name = name:gsub( '%s+$', '' )
	end
	return name
end

-- Get the parameters from the given template wikitext.
-- Unnamed parameters are keyed by their position among the unnamed parameters (1, 2, ...).
-- Names and values are trimmed.
-- @param templateWikitext Required. Wikitext of the template to parse.
-- @return Map from parameter names to parameter values, NOT IN THE ORIGINAL ORDER.
-- @return Order in which the parameters were parsed.
function WikitextParser.getTemplateParameters( templateWikitext )
	local parameters = {}
	local paramOrder = {}
	local params = templateWikitext:match( '{{[^|}]-|(.*)}}' )
	if params then

		-- Temporarily replace pipes in subtemplates and links to avoid chaos
		-- '%' is doubled because the masked text is used as a gsub replacement string
		local mask = { ['%'] = '%%', ['|'] = '@@:@@', ['='] = '@@_@@' }
		for subtemplate in params:gmatch( '{%b{}}' ) do
			-- Parentheses keep only the masked string; without them gsub's
			-- substitution count would become the outer gsub's replacement limit.
			local masked = ( subtemplate:gsub( '.', mask ) )
			params = params:gsub( escapeString( subtemplate ), masked )
		end
		for link in params:gmatch( '%[%b[]%]' ) do
			local masked = ( link:gsub( '.', mask ) )
			params = params:gsub( escapeString( link ), masked )
		end

		local count = 0 -- number of unnamed parameters seen so far
		for param in mw.text.gsplit( params, '|' ) do
			local parts = mw.text.split( param, '=' )
			local name = mw.text.trim( parts[1] )
			local value
			if #parts == 1 then
				-- No '=' at all: an unnamed parameter, keyed by its position
				value = name
				count = count + 1
				name = count
			else
				-- Everything after the first '=' is the value, further '=' included
				value = mw.text.trim( table.concat( parts, '=', 2 ) )
			end
			-- Restore the characters masked above
			value = value:gsub( '@@_@@', '=' )
			value = value:gsub( '@@:@@', '|' )
			parameters[ name ] = value
			paramOrder[ #paramOrder + 1 ] = name
		end
	end
	return parameters, paramOrder
end

-- Get the tags from the given wikitext.
-- @param wikitext Required.
--   Wikitext to parse.
-- @return Sequence of tags.
function WikitextParser.getTags( wikitext )
	local tags = {}

	-- Private helper: find the end of a tag that may contain others like it.
	-- Walks the opening and closing tags of the given name in document order,
	-- starting at "from" (just after the outermost opening tag, depth 1).
	-- @return Position of the last character of the matching closing tag, or nil if the tag is unclosed
	local function findBalancedEnd( namePattern, from )
		local openPattern = '()< ?' .. namePattern .. '[ >]'
		local closePattern = '()</ ?' .. namePattern .. ' ?>()'
		local depth = 1
		local position = from
		while true do
			local closeStart, closeEnd = wikitext:match( closePattern, position )
			if not closeStart then
				return nil -- unclosed tag
			end
			local openStart = wikitext:match( openPattern, position )
			if openStart and openStart < closeStart then
				-- Another opening tag comes first: one level deeper
				depth = depth + 1
				position = openStart + 1
			else
				depth = depth - 1
				if depth == 0 then
					return closeEnd - 1
				end
				-- Continue right after this closing tag so none is skipped
				position = closeEnd
			end
		end
	end

	-- Don't match closing tags like </div>, comments like <!--foo-->, comparisons like 1<2 or things like <3
	for tagStart, tagOpen in wikitext:gmatch( '()(<[^/!%d].->)' ) do
		local tag
		local tagName = WikitextParser.getTagName( tagOpen )

		-- If we're in a self-closing tag, like <ref name="foo" />, <references/>, <br/>, <br>, <hr>, etc.
		if tagOpen:match( '<.-/>' ) or tagName == 'br' or tagName == 'hr' then
			tag = tagOpen

		-- If we're in a tag that may contain others like it, like <div> or <span>
		elseif tagName == 'div' or tagName == 'span' then
			local tagEnd = findBalancedEnd( escapeString( tagName ), tagStart + #tagOpen )
			-- An unclosed <div> or <span> extends to the end of the wikitext
			tag = wikitext:sub( tagStart, tagEnd or -1 )

		-- Else we're probably in tag that shouldn't contain others like it, like <math> or <strong>
		else
			-- The name is escaped because it comes straight from the wikitext
			-- and may contain pattern magic characters, like in <[a>
			local tagEnd = wikitext:match( '</ ?' .. escapeString( tagName ) .. ' ?>()', tagStart )
			if tagEnd then
				tag = wikitext:sub( tagStart, tagEnd - 1 )
			end
			-- If no end tag is found, assume we matched something that wasn't a tag, like <no. 1>
		end

		if tag then
			tags[ #tags + 1 ] = tag
		end
	end
	return tags
end

-- Get the name of the tag in the given wikitext
-- @param tagWikitext Required. Tag to parse.
-- @return Name of the tag in lower case, or nil if not found
function WikitextParser.getTagName( tagWikitext )
	local tagName = tagWikitext:match( '^< *(.-)[ />]' )
	if tagName then
		tagName = tagName:lower()
	end
	return tagName
end

-- Get the value of an attribute in the given tag.
-- @param tagWikitext Required. Wikitext of the tag to parse.
-- @param attribute Required. Name of the attribute. Taken literally, pattern magic characters are escaped.
-- @return Value of the attribute or nil if not found
function WikitextParser.getTagAttribute( tagWikitext, attribute )
	return tagWikitext:match( '^< *.- *[^/>]*' .. escapeString( attribute ) .. ' *= *["\']?([^"\'>]+)["\']?[ />]' )
end

-- Get the content of the given tag.
-- The content is everything between the opening tag and the last closing tag in the given wikitext.
-- @param tagWikitext Required. Wikitext of the tag to parse.
-- @param attribute Unused, kept so existing calls keep working.
-- @return Content of the tag. May be empty if the tag is empty. Will be nil if the tag is self-closing.
-- @todo May fail if the opening tag has a '>' inside an attribute value
function WikitextParser.getTagContent( tagWikitext, attribute )
	-- The capture is what makes this return the content rather than the whole tag
	return tagWikitext:match( '^<.->(.*)</.->' )
end

-- Get the <gallery> tags from the given wikitext.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of gallery tags.
function WikitextParser.getGalleries( wikitext )
	local galleries = {}
	for _, tag in ipairs( WikitextParser.getTags( wikitext ) ) do
		if WikitextParser.getTagName( tag ) == 'gallery' then
			galleries[ #galleries + 1 ] = tag
		end
	end
	return galleries
end

-- Get the <ref> tags from the given wikitext.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of ref tags.
function WikitextParser.getReferences( wikitext )
	local references = {}
	for _, tag in ipairs( WikitextParser.getTags( wikitext ) ) do
		if WikitextParser.getTagName( tag ) == 'ref' then
			references[ #references + 1 ] = tag
		end
	end
	return references
end

-- Get the reference with the given name from the given wikitext.
-- Self-closing references like <ref name="foo" /> have no content and are skipped.
-- @param wikitext Required. Wikitext to parse.
-- @param referenceName Required. Name of the reference.
-- @return Wikitext of the first matching reference, or nil if not found
function WikitextParser.getReference( wikitext, referenceName )
	for _, reference in ipairs( WikitextParser.getReferences( wikitext ) ) do
		local content = WikitextParser.getTagContent( reference )
		local name = WikitextParser.getTagAttribute( reference, 'name' )
		if content and name == referenceName then
			return reference
		end
	end
end

-- Get the tables from the given wikitext.
-- @param wikitext Required.
--   Wikitext to parse.
-- @return Sequence of tables.
function WikitextParser.getTables( wikitext )
	local tables = {}
	-- Leading newline: a table on the first line can match
	wikitext = '\n' .. wikitext
	for t in wikitext:gmatch( '\n%b{}' ) do
		-- '\n%b{}' also matches templates at the start of a line; keep only tables
		if t:sub( 1, 3 ) == '\n{|' then
			t = mw.text.trim( t ) -- exclude the leading newline
			tables[ #tables + 1 ] = t
		end
	end
	return tables
end

-- Get the value of an attribute from the first line of the given table wikitext
-- @param tableWikitext Required. Wikitext of the table to parse.
-- @param attribute Required. Name of the attribute. Taken literally, pattern magic characters are escaped.
-- @return Value of the attribute or nil if not found
function WikitextParser.getTableAttribute( tableWikitext, attribute )
	return tableWikitext:match( '^{|[^\n]*' .. escapeString( attribute ) .. ' *= *["\']?([^"\'\n]+)["\']?[^\n]*\n' )
end

-- Get a table by id from the given wikitext
-- If several tables have the given id, the first one is returned.
-- @param wikitext Required. Wikitext to parse.
-- @param id Required. Id of the table
-- @return Wikitext of the table or nil if not found
function WikitextParser.getTable( wikitext, id )
	-- ipairs guarantees document order
	for _, t in ipairs( WikitextParser.getTables( wikitext ) ) do
		if id == WikitextParser.getTableAttribute( t, 'id' ) then
			return t
		end
	end
end

-- Get the data from the given table wikitext
-- @param tableWikitext Required.
--   Wikitext of the table to parse.
-- @return Table data: sequence of rows, each row a sequence of trimmed cell wikitexts
-- @todo Test and make more robust
function WikitextParser.getTableData( tableWikitext )
	local tableData = {}
	tableWikitext = mw.text.trim( tableWikitext )
	tableWikitext = tableWikitext:gsub( '^{|.-\n', '' ) -- remove the header
	tableWikitext = tableWikitext:gsub( '\n|}$', '' ) -- remove the footer
	tableWikitext = tableWikitext:gsub( '^|%+.-\n', '' ) -- remove any caption
	tableWikitext = tableWikitext:gsub( '|%-.-\n', '|-\n' ) -- remove any row attributes
	tableWikitext = tableWikitext:gsub( '^|%-\n', '' ) -- remove any leading empty row
	tableWikitext = tableWikitext:gsub( '\n|%-$', '' ) -- remove any trailing empty row

	-- Split the table (first argument) on the plain-text row separator '|-'
	for rowWikitext in mw.text.gsplit( tableWikitext, '|-', true ) do
		local rowData = {}

		-- Normalize every cell separator to a newline followed by a pipe
		rowWikitext = rowWikitext:gsub( '||', '\n|' )
		rowWikitext = rowWikitext:gsub( '!!', '\n|' )
		rowWikitext = rowWikitext:gsub( '\n!', '\n|' )
		rowWikitext = rowWikitext:gsub( '^!', '\n|' )
		rowWikitext = rowWikitext:gsub( '^\n|', '' ) -- no empty cell before the first separator

		for cellWikitext in mw.text.gsplit( rowWikitext, '\n|' ) do
			rowData[ #rowData + 1 ] = mw.text.trim( cellWikitext )
		end
		tableData[ #tableData + 1 ] = rowData
	end
	return tableData
end

-- Get the internal links from the given wikitext (includes category and file links).
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of internal links.
function WikitextParser.getLinks( wikitext )
	local links = {}
	for link in wikitext:gmatch( '%[%b[]%]' ) do
		links[ #links + 1 ] = link
	end
	return links
end

-- Get the file links from the given wikitext.
-- A link is a file link when the text before its first colon is a namespace
-- name or alias known to the wiki whose canonical name is "File".
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of file links.
function WikitextParser.getFiles( wikitext )
	local files = {}
	-- ipairs guarantees document order
	for _, link in ipairs( WikitextParser.getLinks( wikitext ) ) do
		local namespace = link:match( '^%[%[ *(.-) *:' )
		local info = namespace and mw.site.namespaces[ namespace ]
		if info and info.canonicalName == 'File' then
			files[ #files + 1 ] = link
		end
	end
	return files
end

-- Get name of the file from the given file wikitext.
-- @param fileWikitext Required.
--   Wikitext of the file to parse.
-- @return Name of the file (the text between the first colon and the first pipe or closing bracket), or nil if not found
function WikitextParser.getFileName( fileWikitext )
	return fileWikitext:match( '^%[%[ *.- *: *(.-) *[]|]' )
end

-- Get the category links from the given wikitext.
-- A link is a category link when the text before its first colon is a namespace
-- name or alias known to the wiki whose canonical name is "Category".
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of category links.
function WikitextParser.getCategories( wikitext )
	local categories = {}
	-- ipairs guarantees document order
	for _, link in ipairs( WikitextParser.getLinks( wikitext ) ) do
		-- ' *' (greedy) rather than ' -' (lazy): leading spaces must be consumed
		-- before the capture starts, or "[[ Category:X]]" would yield the
		-- namespace " Category" and fail the lookup. Same pattern as getFiles.
		local namespace = link:match( '^%[%[ *(.-) *:' )
		local info = namespace and mw.site.namespaces[ namespace ]
		if info and info.canonicalName == 'Category' then
			categories[ #categories + 1 ] = link
		end
	end
	return categories
end

-- Get the external links from the given wikitext.
-- Only bracketed links that start with "//", "http://" or "https://" are returned.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of external links.
function WikitextParser.getExternalLinks( wikitext )
	local links = {}
	for link in wikitext:gmatch( '%b[]' ) do
		if link:match( '^%[//' ) or link:match( '^%[https?://' ) then
			links[ #links + 1 ] = link
		end
	end
	return links
end

return WikitextParser
close