From 61173c1b58efa79c0ba6b08348d2796a249d0186 Mon Sep 17 00:00:00 2001 From: Norbert Thiebaud Date: Sat, 1 Sep 2012 09:51:27 -0500 Subject: move help structure one directory up Change-Id: Ie970e39fbb6795a92d9fdd13510409d7dcd071bc --- to-wiki/wikiconv2.py | 1383 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1383 insertions(+) create mode 100755 to-wiki/wikiconv2.py (limited to 'to-wiki/wikiconv2.py') diff --git a/to-wiki/wikiconv2.py b/to-wiki/wikiconv2.py new file mode 100755 index 0000000000..762484d225 --- /dev/null +++ b/to-wiki/wikiconv2.py @@ -0,0 +1,1383 @@ +#!/usr/bin/env python + +import os, sys, thread, threading, time +import xml.parsers.expat +import codecs +from threading import Thread + +root="source/" +max_threads = 25 + +titles = [] + +# map of id -> localized text +localization_data = {} + +# to collect a list of pages that will be redirections to the pages with nice +# names +redirects = [] + +# to collect images that we will up-load later +images = set() + +# various types of paragraphs +replace_paragraph_role = \ + {'start':{'bascode': '', + 'code': '', + 'codeintip': '', + 'emph' : '', # must be empty to be able to strip empty + 'example': '', + 'heading1': '= ', + 'heading2': '== ', + 'heading3': '=== ', + 'heading4': '==== ', + 'heading5': '===== ', + 'heading6': '====== ', + 'head1': '= ', # used only in one file, probably in error? + 'head2': '== ', # used only in one file, probably in error? + 'listitem': '', + 'note': '{{Note|', + 'null': '', # special paragraph for Variable, CaseInline, etc. + 'paragraph': '', + 'related': '', # used only in one file, probably in error? + 'relatedtopics': '', # used only in one file, probably in error? + 'sup' : '', + 'tablecontent': '| | ', + 'tablecontentcode': '| | ', + 'tablehead': '! 
scope="col" | ', + 'tablenextpara': '\n', + 'tablenextparacode': '\n', + 'tip': '{{Tip|', + 'variable': '', + 'warning': '{{Warning|', + }, + 'end':{'bascode': '\n', + 'code': '\n\n', + 'codeintip': '\n\n', + 'emph' : '', + 'example': '\n\n', + 'heading1': ' =\n\n', + 'heading2': ' ==\n\n', + 'heading3': ' ===\n\n', + 'heading4': ' ====\n\n', + 'heading5': ' =====\n\n', + 'heading6': ' ======\n\n', + 'head1': ' =\n\n', # used only in one file, probably in error? + 'head2': ' ==\n\n', # used only in one file, probably in error? + 'listitem': '', + 'note': '}}\n\n', + 'null': '', # special paragraph for Variable, CaseInline, etc. + 'paragraph': '\n\n', + 'related': '\n\n', # used only in one file, probably in error? + 'relatedtopics': '\n\n', # used only in one file, probably in error? + 'sup' : '', + 'tablecontent': '\n', + 'tablecontentcode': '\n', + 'tablehead': '\n', + 'tablenextpara': '\n', + 'tablenextparacode': '\n', + 'tip': '}}\n\n', + 'variable': '', + 'warning': '}}\n\n', + }, + 'templ':{'bascode': False, + 'code': False, + 'codeintip': False, + 'emph' : False, + 'example': False, + 'heading1': False, + 'heading2': False, + 'heading3': False, + 'heading4': False, + 'heading5': False, + 'heading6': False, + 'head1': False, + 'head2': False, + 'listitem': False, + 'note': True, + 'null': False, + 'paragraph': False, + 'related': False, + 'relatedtopics': False, + 'sup' : False, + 'tablecontent': False, + 'tablecontentcode': False, + 'tablehead': False, + 'tablenextpara': False, + 'tablenextparacode': False, + 'tip': True, + 'variable': False, + 'warning': True, + } + } + +section_id_mapping = \ + {'relatedtopics': 'RelatedTopics'} + +# text snippets that we need to convert +replace_text_list = \ + [["$[officename]", "{{ProductName}}"], + ["%PRODUCTNAME", "{{ProductName}}"], + ["$PRODUCTNAME", "{{ProductName}}"] + ] + +def get_link_filename(link, name): + text = link.strip() + fragment = '' + if text.find('http') == 0: + text = name + else: + f = 
# Load the translations from a tab-separated .sdf file into the module-level
# 'localization_data' dictionary (which is reset first).
#
# sdf_file - path of the .sdf file; it is read as UTF-8
#
# Every usable line contributes one entry: the key is built from the 2nd
# field (with backslashes turned into slashes) and the 5th field, joined by
# '#'; the value is the 11th field.  Blank lines and lines starting with '#'
# are skipped; lines with too few fields are reported on stderr and ignored.
#
# Returns False when the file cannot be opened, True otherwise.
def load_localization_data(sdf_file):
    global localization_data
    localization_data = {}
    try:
        sdf = codecs.open(sdf_file, "r", "utf-8")
    except (IOError, OSError):
        sys.stderr.write('Error: Cannot open .sdf file "%s"\n'% sdf_file)
        return False

    try:
        for line in sdf:
            line = line.strip()
            # a blank line has no first character to test
            if not line or line[0] == '#':
                continue
            spl = line.split("\t")

            try:
                # the form of the key is like
                # source/text/shared/explorer/database/02010100.xhp#hd_id3149233
                # otherwise we are getting duplicates
                key = '%s#%s'% (spl[1].replace('\\', '/'), spl[4])
                localization_data[key] = spl[10]
            except IndexError:
                # not enough fields on this line
                sys.stderr.write('Warning: Ignored line "%s"\n'% line.encode('utf-8'))
    finally:
        # close the file even when reading or decoding fails
        sdf.close()

    return True
# Split a reference of the form 'file#id' into its two parts.
#
# href - the reference; any double quotes in it are dropped first
#
# Returns the list [fname, id], split at the first '#'.  When there is no '#'
# in the reference, a warning is written to stderr and the id is ''.
def href_to_fname_id(href):
    link = href.replace('"', '')
    (fname, sep, fragment) = link.partition('#')
    if sep == '':
        sys.stderr.write('Reference without a "#" in "%s".\n'% link)

    return [fname, fragment]
# Base class for the simple elements whose whole content is character data.
#
# self.text  - the character data collected so far
# self.start - markup emitted before the text
# self.end   - markup emitted after the text
# self.templ - when true, the text is additionally passed through
#              escape_equals_sign() before it is emitted
class TextElementBase(ElementBase):
    def __init__(self, attrs, parent, element_name, start, end, templ):
        ElementBase.__init__(self, element_name, parent)
        self.start, self.end = start, end
        self.templ = templ
        self.text = u''

    # collect the character data; it may arrive in several pieces
    def char_data(self, parser, data):
        self.text += data

    # the collected text with the replacements applied, wrapped in the
    # start and end markup
    def get_all(self):
        body = replace_text(self.text)
        if self.templ:
            body = escape_equals_sign(body)
        return self.start + body + self.end
# A <bookmark> element: it becomes an empty anchor (div or span) with the
# bookmark's id and, for 'hid/' branches, also records a redirect to that
# anchor in the global 'redirects' list.
#
# self.type          - 'div' or 'span', the kind of anchor to generate
# self.id            - the 'id' attribute of the bookmark
# self.app           - parser.current_app_raw at the time of parsing
# self.redirect      - the branch without its leading 'hid/'
# self.target        - parser.wiki_page_name at the time of parsing
# self.authoritative - parser.follow_embed at the time of parsing
class Bookmark(ElementBase):
    def __init__(self, attrs, parent, type, parser):
        ElementBase.__init__(self, 'bookmark', parent)

        self.type = type
        self.id = attrs['id']

        # the redirect data stays empty unless this is a 'hid/' bookmark
        self.app = ''
        self.redirect = ''
        self.target = ''
        self.authoritative = False

        # let's construct the name of the redirect, so that we can point
        # to the wikihelp directly from the LO code; wiki then takes care of
        # the correct redirect
        branch = attrs['branch']
        wanted = parser.current_app_raw != '' or parser.follow_embed
        if branch.startswith('hid/') and wanted:
            self.redirect = branch[len('hid/'):]
            self.app = parser.current_app_raw
            self.target = parser.wiki_page_name
            self.authoritative = parser.follow_embed

    def get_all(self):
        global redirects
        # first of all, we need to create a redirect page for this one
        if self.redirect != '' and self.target != '':
            anchor = '%s#%s'% (self.target, self.id)
            redirects.append([self.app, self.redirect, anchor, self.authoritative])

        # then we also have to setup ID inside the page
        if self.type == 'div':
            return '<div id="%s"></div>\n'% self.id
        if self.type == 'span':
            return '<span id="%s"></span>'% self.id

        sys.stderr.write('Unknown bookmark type "%s"'% self.type)
        return ''
# A <tablecell> element.
#
# self.cellHasChildElement - set as soon as any child element starts; a cell
#                            that never gets one is rendered as an empty cell
class TableCell(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'tablecell', parent)
        self.cellHasChildElement = False

    def start_element(self, parser, name, attrs):
        self.cellHasChildElement = True
        if name == 'bookmark':
            self.parse_child(Bookmark(attrs, self, 'div', parser))
        elif name == 'comment':
            self.parse_child(Comment(attrs, self))
        elif name == 'embed' or name == 'embedvar':
            (fname, id) = href_to_fname_id(attrs['href'])
            if parser.follow_embed:
                self.embed_href(parser, fname, id)
        elif name == 'paragraph':
            parser.parse_localized_paragraph(TableContentParagraph(attrs, self), attrs, self)
        elif name == 'section':
            self.parse_child(Section(attrs, self))
        elif name == 'bascode':
            # ignored, do not syntax highlight in table cells
            pass
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        text = ''
        if not self.cellHasChildElement: # an empty element
            # isTableHeader is put on the row only by TableContentParagraph,
            # so it does not exist yet when the empty cell comes before any
            # paragraph of the row; treat that as a normal content cell
            if getattr(self.parent, 'isTableHeader', False):
                role = 'tablehead'
            else:
                role = 'tablecontent'
            text = replace_paragraph_role['start'][role] + \
                   replace_paragraph_role['end'][role]
        return text + ElementBase.get_all(self)
# A <list> element.
#
# self.type      - the 'type' attribute (required); ListItem checks it for
#                  'ordered'
# self.startwith - the 'startwith' attribute as a number, 0 when it is
#                  missing or not a number; a value above 0 makes the list
#                  an explicit <ol start="..."> block
class List(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'list', parent)

        self.type = attrs['type']
        try:
            self.startwith = int(attrs['startwith'])
        except (KeyError, ValueError):
            # no 'startwith' at all, or not a number
            self.startwith = 0

    def start_element(self, parser, name, attrs):
        if name == 'listitem':
            self.parse_child(ListItem(attrs, self))
        else:
            self.unhandled_element(parser, name)

    # the items, wrapped in <ol start="..."> when a start number was given;
    # otherwise just followed by an empty line
    def get_all(self):
        items = ElementBase.get_all(self)
        if self.startwith > 0:
            return '<ol start="%d">\n'% self.startwith + items + '\n</ol>\n'
        return items + '\n'
# The <topic> element: its <title> child becomes an OrigTitle when no
# language is set on the parser, or a Title carrying the localized text
# looked up under the 'tit' id otherwise; <filename> is ignored.
class Topic(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'topic', parent)

    def start_element(self, parser, name, attrs):
        if name == 'filename':
            self.parse_child(Ignore(attrs, self, name))
            return
        if name != 'title':
            self.unhandled_element(parser, name)
            return

        if parser.lang == '':
            title = OrigTitle(attrs, self)
        else:
            localized = get_localized_text(parser.filename, 'tit')
            title = Title(attrs, self, localized)
        self.parse_child(title)
# A <sort> element: its <section> children are emitted sorted by their id.
#
# self.order - the 'order' attribute, 'asc' when it is not given; only
#              'desc' reverses the order
class Sort(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'sort', parent)

        self.order = attrs.get('order', 'asc')

    def start_element(self, parser, name, attrs):
        if name == 'section':
            self.parse_child(Section(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        # sorted(reverse=True) gives descending order, so it must be used
        # for 'desc', not for 'asc'
        descending = (self.order == 'desc')
        self.objects = sorted(self.objects, key=lambda obj: obj.id, reverse=descending)

        return ElementBase.get_all(self)
# A <switchinline> element: chooses among its <caseinline>/<defaultinline>
# children, or turns them into wiki templates.
#
# self.switch        - the 'select' attribute; 'sys' and 'appl' are handled
# self.embedding_app - the application the text is being embedded into, ''
#                      when there is none
class SwitchInline(ElementBase):
    # case value -> parameter name of the {{System}} template, in the order
    # the parameters are emitted
    system_params = [['default', 'default'], ['MAC', 'mac'], \
                     ['UNIX', 'unx'], ['WIN', 'win']]

    # case value -> suffix of the {{WhenIn...}} template; '' marks the cases
    # that are used as the default text
    appl_names = {'BASIC':'Basic', 'CALC':'Calc', \
                  'CHART':'Chart', 'DRAW':'Draw', \
                  'IMAGE':'Draw', 'IMPRESS': 'Impress', \
                  'MATH':'Math', 'WRITER':'Writer', \
                  'OFFICE':'', 'default':''}

    def __init__(self, attrs, parent, app):
        ElementBase.__init__(self, 'switchinline', parent)
        self.switch = attrs['select']
        self.embedding_app = app

    def start_element(self, parser, name, attrs):
        if name == 'caseinline':
            self.parse_child(CaseInline(attrs, self, False))
        elif name == 'defaultinline':
            self.parse_child(CaseInline(attrs, self, True))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        if len(self.objects) == 0:
            return ''

        if self.switch == 'sys':
            return self._get_all_sys()

        if self.switch == 'appl':
            # we want directly use the right text, when inlining something
            # 'shared' into an 'app'
            if self.embedding_app == '':
                return self._get_all_appl()

            for i in self.objects:
                if i.case == self.embedding_app:
                    return i.get_all()

        # no matching case, or a 'select' value that is not handled
        return ''

    # build the {{System|...}} template from the per-system cases
    def _get_all_sys(self):
        system = {'MAC':'', 'UNIX':'', 'WIN':'', 'default':''}
        for i in self.objects:
            if i.case in system:
                system[i.case] = i.get_all()
            elif i.case == 'OS2':
                # ignore, there is only one mention of OS2, which is a
                # 'note to translators', and no meat
                pass
            elif i.case == 'HIDE_HERE':
                # do what the name suggest ;-)
                pass
            else:
                sys.stderr.write('Unhandled "%s" case in "sys" switchinline.\n'% \
                        i.case )

        text = '{{System'
        for (case, param) in self.system_params:
            if system[case] != '':
                text = '%s|%s=%s'% (text, param, system[case])
        return text + '}}'

    # build the {{WhenIn<App>|...}} templates from the per-application cases;
    # when none of them produced text, the default text is used as it is
    def _get_all_appl(self):
        text = ''
        default = ''
        for i in self.objects:
            # only the lookup may fail here; problems while rendering the
            # case itself must not be reported as an unhandled case
            try:
                app = self.appl_names[i.case]
            except KeyError:
                sys.stderr.write('Unhandled "%s" case in "appl" switchinline.\n'% \
                        i.case)
                continue

            content = i.get_all()
            if content == '':
                continue
            if app == '':
                default = content
            else:
                text = text + '{{WhenIn%s|%s}}'% (app, escape_equals_sign(content))

        if text == '':
            return default
        if default != '':
            text = text + '{{WhenDefault|%s}}'% escape_equals_sign(default)
        return text
# A <paragraph> element, also the base class of the inline elements that
# can hold the same children.
#
# self.role     - the 'role' attribute, 'paragraph' when it is not given; it
#                 selects the markup from replace_paragraph_role
# self.id       - the 'id' attribute, "" when it is not given
# self.level    - the 'level' attribute as a number, 0 when it is missing or
#                 not a number; used for the 'heading' role
# self.is_first - whether the parent had no objects yet when this paragraph
#                 was created
class Paragraph(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'paragraph', parent)

        self.role = attrs.get('role', 'paragraph')
        self.id = attrs.get('id', "")

        try:
            self.level = int(attrs['level'])
        except (KeyError, ValueError):
            self.level = 0

        self.is_first = (len(self.parent.objects) == 0)

    def start_element(self, parser, name, attrs):
        if name == 'ahelp':
            # only the hidden ahelp is dropped; otherwise the tag itself is
            # skipped and its content is handled as part of this paragraph
            if attrs.get('visibility') == 'hidden':
                self.parse_child(Ignore(attrs, self, name))
        elif name == 'br':
            self.parse_child(Br(attrs, self))
        elif name == 'comment':
            self.parse_child(Comment(attrs, self))
        elif name == 'emph':
            self.parse_child(Emph(attrs, self))
        elif name == 'sup':
            self.parse_child(Sup(attrs, self))
        elif name == 'embedvar':
            if parser.follow_embed:
                (fname, id) = href_to_fname_id(attrs['href'])
                self.embed_href(parser, fname, id)
        elif name == 'help-id-missing':
            self.parse_child(HelpIdMissing(attrs, self))
        elif name == 'image':
            self.parse_child(Image(attrs, self))
        elif name == 'item':
            self.parse_child(Item(attrs, self))
        elif name == 'link':
            self.parse_child(Link(attrs, self, parser.lang))
        elif name == 'localized':
            # we ignore this tag, it is added arbitrary for the paragraphs
            # that come from .sdf files
            pass
        elif name == 'switchinline':
            self.parse_child(SwitchInline(attrs, self, parser.embedding_app))
        elif name == 'variable':
            self.parse_child(Variable(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def char_data(self, parser, data):
        # NOTE(review): the newline replacement is assumed to apply to all
        # text of these four roles, not only to text starting with a space
        # - confirm against the original indentation
        if self.role in ('paragraph', 'heading', 'listitem', 'variable'):
            if data != '' and data[0] == ' ':
                data = ' ' + data.lstrip()
            data = data.replace('\n', ' ')

        if len(data):
            self.objects.append(Text(data))

    def get_all(self):
        role = self.role
        if role == 'heading':
            if self.level <= 0:
                sys.stderr.write('Heading, but the level is %d.\n'% self.level)
            elif self.level < 6:
                role = 'heading%d'% self.level
            else:
                role = 'heading6'

        # if we are not the first para in the table, we need special handling
        if not self.is_first and role.find('table') == 0:
            if role == 'tablecontentcode':
                role = 'tablenextparacode'
            else:
                role = 'tablenextpara'

        # the text itself
        children = ElementBase.get_all(self)
        if self.role != 'emph' and self.role != 'bascode':
            children = children.strip()

        if len(children) == 0:
            return ''

        # prepend the markup according to the role
        text = ''
        try:
            text = text + replace_paragraph_role['start'][role]
        except KeyError:
            sys.stderr.write( "Unknown paragraph role start: " + role + "\n" )

        # an unknown role is not a template either; it must not abort the
        # conversion after the warning above
        if replace_paragraph_role['templ'].get(role, False):
            text = text + escape_equals_sign(children)
        else:
            text = text + children

        # append the markup according to the role
        try:
            text = text + replace_paragraph_role['end'][role]
        except KeyError:
            sys.stderr.write( "Unknown paragraph role end: " + role + "\n" )

        return text
embedding_app, current_app, wiki_page_name, lang, head_object, buffer): + self.filename = filename + self.follow_embed = follow_embed + self.embedding_app = embedding_app + self.current_app = current_app + self.wiki_page_name = wiki_page_name + self.lang = lang + self.head_obj = head_object + + p = xml.parsers.expat.ParserCreate() + p.StartElementHandler = self.start_element + p.EndElementHandler = self.end_element + p.CharacterDataHandler = self.char_data + + p.Parse(buffer) + + def start_element(self, name, attrs): + self.head_obj.get_curobj().start_element(self, name, attrs) + + def end_element(self, name): + self.head_obj.get_curobj().end_element(self, name) + + def char_data(self, data): + self.head_obj.get_curobj().char_data(self, data) + + def get_all(self): + return self.head_obj.get_all() + + def get_variable(self, id): + return self.head_obj.get_variable(id) + + def parse_localized_paragraph(self, paragraph, attrs, obj): + localized_text = '' + try: + localized_text = get_localized_text(self.filename, attrs['id']) + except: + pass + + if localized_text != '': + # parse the localized text + text = u'<?xml version="1.0" encoding="UTF-8"?><localized>' + localized_text + '</localized>' + ParserBase(self.filename, self.follow_embed, self.embedding_app, \ + self.current_app, self.wiki_page_name, self.lang, \ + paragraph, text.encode('utf-8')) + # add it to the overall structure + obj.objects.append(paragraph) + # and ignore the original text + obj.parse_child(Ignore(attrs, obj, 'paragraph')) + else: + obj.parse_child(paragraph) + + def parse_paragraph(self, attrs, obj): + ignore_this = False + try: + if attrs['role'] == 'heading' and int(attrs['level']) == 1 \ + and self.ignore_heading and self.follow_embed: + self.ignore_heading = False + ignore_this = True + except: + pass + + if ignore_this: + obj.parse_child(Ignore(attrs, obj, 'paragraph')) + else: + self.parse_localized_paragraph(Paragraph(attrs, obj), attrs, obj) + +class XhpParser(ParserBase): + def 
__init__(self, filename, follow_embed, embedding_app, wiki_page_name, lang): + # we want to ignore the 1st level="1" heading, because in most of the + # cases, it is the only level="1" heading in the file, and it is the + # same as the page title + self.ignore_heading = True + + current_app = '' + self.current_app_raw = '' + for i in [['sbasic', 'BASIC'], ['scalc', 'CALC'], \ + ['sdatabase', 'DATABASE'], ['sdraw', 'DRAW'], \ + ['schart', 'CHART'], ['simpress', 'IMPRESS'], \ + ['smath', 'MATH'], ['swriter', 'WRITER']]: + if filename.find('/%s/'% i[0]) >= 0: + self.current_app_raw = i[0] + current_app = i[1] + break + + if embedding_app == '': + embedding_app = current_app + + file = codecs.open(filename, "r", "utf-8") + buf = file.read() + file.close() + + ParserBase.__init__(self, filename, follow_embed, embedding_app, + current_app, wiki_page_name, lang, XhpFile(), buf.encode('utf-8')) + +def loadallfiles(filename): + global titles + titles = [] + file = codecs.open(filename, "r", "utf-8") + for line in file: + title = line.split(";", 2) + titles.append(title) + file.close() + +class WikiConverter(Thread): + def __init__(self, inputfile, wiki_page_name, lang, outputfile): + Thread.__init__(self) + self.inputfile = inputfile + self.wiki_page_name = wiki_page_name + self.lang = lang + self.outputfile = outputfile + + def run(self): + parser = XhpParser(self.inputfile, True, '', self.wiki_page_name, self.lang) + file = codecs.open(self.outputfile, "wb", "utf-8") + file.write(parser.get_all()) + file.close() + +def write_link(r, target): + fname = 'wiki/%s'% r + try: + file = open(fname, "w") + file.write('#REDIRECT [[%s]]\n'% target) + file.close() + except: + sys.stderr.write('Unable to write "%s".\n'%'wiki/%s'% fname) + +def write_redirects(): + print 'Generating the redirects...' 
+ written = {} + # in the first pass, immediately writte the links that are embedded, so that + # we can always point to that source versions + for redir in redirects: + app = redir[0] + redirect = redir[1] + target = redir[2] + authoritative = redir[3] + + if app != '': + r = '%s/%s'% (app, redirect) + if authoritative: + write_link(r, target) + written[r] = True + else: + try: + written[r] + except: + written[r] = False + + # in the second pass, output the wiki links + for redir in redirects: + app = redir[0] + redirect = redir[1] + target = redir[2] + + if app == '': + for i in ['swriter', 'scalc', 'simpress', 'sdraw', 'smath', \ + 'schart', 'sbasic', 'sdatabase']: + write_link('%s/%s'% (i, redirect), target) + else: + r = '%s/%s'% (app, redirect) + if not written[r]: + write_link(r, target) + +# Main Function +def convert(generate_redirects, lang, sdf_file): + if lang == '': + print 'Generating the main wiki pages...' + else: + print 'Generating the wiki pages for language %s...'% lang + + global redirects + redirects = [] + global images + images = set() + + loadallfiles("alltitles.csv") + + if lang != '': + sys.stderr.write('Using localizations from "%s"\n'% sdf_file) + if not load_localization_data(sdf_file): + return + + for title in titles: + while threading.active_count() > max_threads: + time.sleep(0.001) + + infile = title[0].strip() + wikiname = title[1].strip() + articledir = 'wiki/' + wikiname + try: + os.mkdir(articledir) + except: + pass + + outfile = '' + if lang != '': + wikiname = '%s/%s'% (wikiname, lang) + outfile = '%s/%s'% (articledir, lang) + else: + outfile = '%s/MAIN'% articledir + + try: + file = open(outfile, 'r') + except: + try: + wiki = WikiConverter(infile, wikiname, lang, outfile) + wiki.start() + continue + except: + print 'Failed to convert "%s" into "%s".\n'% \ + (infile, outfile) + sys.stderr.write('Warning: Skipping: %s > %s\n'% (infile, outfile)) + file.close() + + # wait for everyone to finish + while 
threading.active_count() > 1: + time.sleep(0.001) + + if lang == '': + # set of the images used here + print 'Generating "images.txt", the list of used images...' + file = open('images.txt', "w") + for image in images: + file.write('%s\n'% image) + file.close() + + # generate the redirects + if generate_redirects: + write_redirects() + +# vim:set shiftwidth=4 softtabstop=4 expandtab: -- cgit