Utilisateur:Phe/Python/typo page.py

# -*- coding: utf-8 -*-

import botpywi
import sys
import re
import wikipedia
import page_cache
import query_ext

# psyco is optional: enable it when it is installed, otherwise run without it.
try:
    import psyco
    psyco.full()
except ImportError:
    # psyco is not available; continue without it.
    pass

# Module-level page cache used by main() (mass_load / read_page /
# write_page) and saved when the script exits.
cache = page_cache.page_cache()

# Many things could be merged into one regexp, but I want to be able to
# test them separately.
def treat_text_part(text):
    """Apply the typographic fixes to one fragment of plain text.

    The rules are applied in order, each as its own statement so that a
    single rule can be commented out while testing:

    1. straight or left-curly apostrophes become a typographic apostrophe;
    2. dashes at the start of a line become an em dash followed by a space;
    3. a space before "," or "." is removed and runs of three or more dots
       become an ellipsis character;
    4. a space is inserted before ; : ! ? and closing guillemets, and after
       ; : ! ? and opening guillemets;
    5. a few artefacts of the previous rules are cleaned up.

    text -- the fragment to fix (a unicode string).
    Returns the fixed fragment.
    """
    # --- Apostrophes -----------------------------------------------------
    # It is difficult to do this properly when the text is already
    # wikified, so an apostrophe is only replaced when neither neighbour is
    # itself a straight apostrophe (which leaves '' and ''' markup alone).
    # Lookarounds are used instead of capturing the neighbours so that
    # overlapping cases such as "rock'n'roll" are fully converted.
    text = re.sub(u"(?<=[^'])['‘](?=[^'])", u'’', text)

    # --- Dashes at the start of a line -----------------------------------
    # En dash opening a line --> em dash.  Slightly risky because an aside
    # may start a line, but an em dash is presumably acceptable there too.
    text = text.replace(u"\n–", u"\n—")
    # Double "-" at the start of a line, but not a quadruple "-".
    text = re.sub(u"\n--(?!--)", u"\n—", text)
    # Single "-" at the start of a line --> dialogue dash (again not a
    # quadruple "-").
    text = re.sub(u"\n-(?!---)", u"\n—", text)
    # Two box-drawing / horizontal-bar characters used for rules.
    text = text.replace(u"\n─", u"\n—")
    text = text.replace(u"\n―", u"\n—")
    # An em dash must be followed by a space.
    text = re.sub(u"—([^ ])", u"— \\1", text)

    # --- Commas, full stops, ellipsis ------------------------------------
    text = re.sub(u'[ ]([,.])', u'\\1', text)
    text = re.sub(u'\\.\\.\\.[.]*', u'…', text)

    # --- Spacing around double punctuation and guillemets ----------------
    text = re.sub(u'([^ \\s])([;:!?])', u'\\1 \\2', text)
    text = re.sub(u'([«;:!?])([^ \\s…])', u'\\1 \\2', text)
    # Kept separate from the previous regexp, otherwise "word!»" overlaps.
    text = re.sub(u'([^ \\s])([»])', u'\\1 \\2', text)

    # --- Workarounds for some buggy text ----------------------------------
    # Drop the space left between punctuation and a line break or <br />.
    text = re.sub(u'([;:!?»]) \n', u'\\1\n', text)
    text = re.sub(u'([;:!?»]) <br />', u'\\1<br />', text)
    return text

def treat_text(old_text):
    """Apply treat_text_part() to a page while leaving markup untouched.

    The page is split around "protected" spans which are copied verbatim:
    opening <div ...> and <span ...> tags, HTML character references
    (named such as &nbsp; or &amp;, decimal such as &#233; and hexadecimal
    such as &#xE9;), single-line HTML comments, runs of ":" or ";" at the
    start of a line, and the target part of a [[wiki link]].  Everything
    between two protected spans goes through treat_text_part().

    Finally, whitespace immediately preceding a <noinclude>...</noinclude>
    block that ends at a line end is removed.

    old_text -- the full page text (a unicode string).
    Returns the corrected page text.
    """
    # All character references are protected, not only &nbsp; and &mdash;:
    # otherwise the punctuation rules insert a space before the final ";"
    # and break the entity (e.g. "&amp;" would become "&amp ;").
    # NOTE: "." does not match a newline here, so a comment spanning several
    # lines is not protected.
    protected = re.compile(
        u'(<(div|span)[^>]*?>'
        u'|&(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#[xX][0-9A-Fa-f]+);'
        u'|<!--.*?-->'
        u'|\n:[:]*|\n;[;]*'
        u'|\\[\\[[^]|]*)')

    # Collect the pieces and join once instead of repeated concatenation.
    pieces = []
    last_match = 0
    for it in protected.finditer(old_text):
        pieces.append(treat_text_part(old_text[last_match:it.start(1)]))
        pieces.append(it.group(1))
        last_match = it.end(1)
    pieces.append(treat_text_part(old_text[last_match:]))
    new_text = u''.join(pieces)

    new_text = re.sub(u'(?ms)[\\s]+(<noinclude>.*?</noinclude>)$', u'\\1',
                      new_text)
    return new_text

# FIXME: factorize
def compare_title(a, b):
    """cmp-style comparison of two page titles by their trailing number.

    Each title must contain a "/<digits>" part; the digits following the
    last such slash are compared as integers.

    Returns 1 when a's number is greater, -1 when it is smaller and 0 when
    both numbers are equal.
    """
    left, right = [int(re.match(u'.*/(\d+)', title).group(1))
                   for title in (a, b)]
    # Sign of (left - right), expressed without the Python 2 only cmp().
    return (left > right) - (left < right)

def main(gen):
    """Propose typographic fixes for every page yielded by gen.

    gen -- an iterable of dict-like objects with a u'title' key.

    For each page whose text is changed by treat_text() (ignoring leading
    and trailing whitespace), the title and a diff are shown and the
    operator chooses whether to upload the new text.  Relies on the
    module-level names ``site`` (set by the script entry point) and
    ``cache``.
    """
    titles = [p[u'title'] for p in gen]
    if not titles:
        # Nothing to do; this also avoids indexing an empty list below.
        return

    # Sub-pages named ".../<number>" are processed in numeric order.  The
    # first title decides whether to sort, as before, but the sort is
    # skipped when some title has no "/<number>" part, because
    # compare_title() cannot handle such a title.
    if re.match(u'.*/\\d+$', titles[0]):
        sortable = True
        for title in titles:
            if not re.match(u'.*/\\d+', title):
                sortable = False
                break
        if sortable:
            titles.sort(compare_title)

    pages = [wikipedia.Page(site=site, title=x) for x in titles]
    cache.mass_load(pages)

    for p in pages:
        text = cache.read_page(p.title(), site=p.site())
        new_text = treat_text(text)
        if new_text.strip() == text.strip():
            # No visible change for this page.
            continue

        print(p.title())
        wikipedia.showDiff(text, new_text)
        # NOTE(review): the question text does not match the offered
        # choices (Next / upload); it looks like a leftover -- confirm
        # before changing this user-facing string.
        choice = wikipedia.inputChoice(u'Next, Quit, Count',
                                       ['Next', 'upload'],
                                       ['N', 'u'], 'n')
        if choice == 'u':
            cache.write_page(p.title(), new_text, u'Typographie',
                             site=p.site())

if __name__ == '__main__':
    # Script entry point.
    #
    # Arguments:
    #   -help            print a short help line
    #   -start:<prefix>  process the pages returned by
    #                    query_ext.PreloadingPagesStartswith for <prefix>
    #   <page title>     process that single page
    # When several page arguments are given, the last one wins.
    try:
        # ``site`` is a module-level name that main() relies on.
        site = wikipedia.getSite(code='fr', fam='wikisource')

        gen = None
        help_shown = False
        # Skip sys.argv[0]: the script path is not a page title.  Iterating
        # over it made the bot process a page named after the script when no
        # page argument (or only -help) was given.
        for arg in sys.argv[1:]:
            if arg == '-help':
                print("%s -help" % sys.argv[0])
                help_shown = True
            elif arg.startswith('-start:'):
                pagename = unicode(arg[len('-start:'):], 'utf-8')
                gen = query_ext.PreloadingPagesStartswith(pagename,
                                                          site=site)
            else:
                gen = [{u'title': unicode(arg, 'utf-8')}]

        if gen is not None:
            main(gen)
        elif not help_shown:
            print("usage: %s [-help] [-start:<prefix> | <page title>]"
                  % sys.argv[0])
    finally:
        # Always release the bot slot and persist the page cache.
        wikipedia.stopme()
        cache.save()