Utilisateur:TptBot/djvuocr.py

Ce script permet de faire l’OCR de fichiers djvu et en publie le résultat sur Wikisource. Il nécessite pywikipedia ainsi que Tesseract 3.00 installé (disponible dans tous les bons dépôts des distributions GNU/Linux avec ses packs de langues). Exemple de commande : python djvuocr.py -filelang:fra -pages:1-25 -djvu:Nom_du_djvu.djvu

djvuocr.py modifier

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""	
This bot uploads the text obtained by OCR of a djvu file onto pages in the
"Page" namespace.  It is intended to be used for Wikisource.

The following parameters are supported:

    -dry           If given, doesn't do any real changes, but only shows
                   what would have been changed.
    -ask           Ask for confirmation before uploading each page.
                   (Default: ask when overwriting pages)
    -djvu:...      Name of the djvu file on commons
    -index:...     Name of the index page
                   (Default: the djvu filename)
    -filelang:...  Tesseract language code of the text, e.g. eng or fra
                   (Default: eng)
    -pages:<start>-<end> Page range to upload; <end> is optional

All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.
"""
#
# (C) Pywikipedia bot team, 2008-2010
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: djvuocr.py 9246 2011-07-29 15:42:46Z xqt $'
import wikipedia as pywikibot
import os, sys, urllib
import config, codecs

class AppURLopener(urllib.FancyURLopener):
    """URL opener that overrides the default ``version`` string.

    NOTE(review): per the urllib documentation, ``version`` is presumably what
    the opener sends as its User-Agent header -- confirm.
    """
    version = 'Pywikipediabot/1.0'
# Install an instance as urllib's module-level opener so that later
# module-level urllib calls in this file (e.g. urllib.urlretrieve) use it.
urllib._urlopener = AppURLopener()

# Needed for the help text displayed when this script is started with the
# -help parameter; this script defines no replacements.
docuReplacements = {}


class DjVuOCRBot:
    """
    Bot that OCRs pages of a DjVu file and saves the text as wiki pages.

    Every selected page is rendered to TIFF with ``ddjvu``, recognised with
    ``tesseract`` and written to the page ``<namespace>:<prefix>/<pageno>``.
    """
    # Edit summary message that should be used.
    # NOTE: Put a good description here, and add translations, if possible!
    msg = {
        'en': u'Robot: Creating page with text form from an OCR of the DjVu',
        'fr': u'Bot: Cree une page depuis un OCR du Djvu',
    }

    # Control characters that are turned into line feeds before saving:
    # US (unit separator), GS (group separator) and FF (form feed).
    _separators = (u'\x1f', u'\x1d', u'\x0c')

    def __init__(self, djvu, index, pages, filelang='eng', ask=False, debug=False):
        """
        Constructor. Parameters:
        djvu : filename of the (local) djvu file
        index : name of the index page
        pages : page range, "<start>-<end>", "<start>-" or "<page>";
                a false value selects every page
        filelang : language code passed to tesseract with -l
        ask : ask for confirmation before every upload
        debug : dry run, only show what would be changed
        """
        self.djvu = djvu
        self.index = index
        self.pages = pages
        self.dry = debug
        self.ask = ask
        self.filelang = filelang

    @staticmethod
    def _encode_cmd(cmd):
        """
        Encode a unicode command line for os.system / os.popen.

        sys.stdout.encoding is None when the output is not a terminal (for
        example when piped), so fall back to the file system encoding and
        finally to UTF-8 instead of failing in encode().
        """
        encoding = (getattr(sys.stdout, 'encoding', None)
                    or sys.getfilesystemencoding()
                    or 'utf-8')
        return cmd.encode(encoding)

    @classmethod
    def _separators_to_newlines(cls, text):
        """
        Return text with every US, GS and FF control character replaced
        by a line feed.
        """
        for separator in cls._separators:
            text = text.replace(separator, u"\n")
        return text

    def NoOfImages(self):
        """
        Return the number of pages of the djvu file as reported by djvused.

        Raises ValueError if djvused does not print a number.
        """
        cmd = u"djvused -e 'n' \"%s\"" % (self.djvu)
        pipe = os.popen(self._encode_cmd(cmd))
        try:
            count = pipe.readline().rstrip()
        finally:
            # do not leak the pipe to the child process
            pipe.close()
        count = int(count)
        pywikibot.output("page count = %d" % count)
        return count

    def PagesGenerator(self):
        """
        Return the list of page numbers to process.

        Without a page range every page from 1 to the page count is used.
        "<start>-<end>" selects an inclusive range, "<start>-" runs up to the
        last page and a single number selects just that page.
        """
        start = 1
        end = self.NoOfImages()

        if self.pages:
            pos = self.pages.find('-')
            if pos != -1:
                start = int(self.pages[:pos])
                # "<start>-" keeps the page count as the end of the range
                if pos < len(self.pages)-1:
                    end = int(self.pages[pos+1:])
            else:
                start = int(self.pages)
                end = start
        pywikibot.output(u"Processing pages %d-%d" % (start, end))
        return range(start, end+1)

    def run(self):
        """
        Process every selected page of the djvu file.
        """
        # Set the edit summary message
        pywikibot.setAction(pywikibot.translate(pywikibot.getSite(), self.msg))

        linkingPage = pywikibot.Page(pywikibot.getSite(), self.index)
        self.prefix = linkingPage.titleWithoutNamespace()
        # strip a leading 'Liber:' that is still part of the title
        if self.prefix[0:6] == 'Liber:':
            self.prefix = self.prefix[6:]
        pywikibot.output(u"Using prefix %s" % self.prefix)
        gen = self.PagesGenerator()

        site = pywikibot.getSite()
        # user name written into the pagequality tag of each created page
        self.username = config.usernames[site.family.name][site.lang]

        for pageno in gen:
            pywikibot.output("Processing page %d" % pageno)
            self.treat(pageno)

    def get_page(self, pageno):
        """
        Return the OCR text of page number pageno of the djvu file.

        The page is rendered to "<djvu>.tiff" with ddjvu and recognised with
        tesseract, which writes "<djvu>.txt".  Both temporary files are
        removed again, even if one of the steps fails.  An error raised while
        opening or reading the text file is propagated to the caller.
        """
        pywikibot.output(u"fetching page %d" % pageno)
        tiff_name = u"%s.tiff" % self.djvu
        text_name = u"%s.txt" % self.djvu
        try:
            cmd = u"ddjvu -format=tiff -page=%d \"%s\" \"%s\" " % (pageno, self.djvu, tiff_name)
            os.system(self._encode_cmd(cmd))
            # tesseract appends ".txt" to the output base name itself
            cmd = u"tesseract \"%s\" \"%s\" -l %s " % (tiff_name, self.djvu, self.filelang)
            os.system(self._encode_cmd(cmd))
            f = codecs.open(text_name, 'r', config.textfile_encoding, 'replace')
            try:
                djvu_text = f.read()
            finally:
                f.close()
        finally:
            # either file may be missing if ddjvu or tesseract failed
            for leftover in (text_name, tiff_name):
                if os.path.exists(leftover):
                    os.remove(leftover)
        return djvu_text

    def treat(self, pageno):
        """
        OCRs the given page of the djvu file and saves the text on the wiki.

        Nothing is saved if the OCR text is empty, if the wiki page already
        has the same text or in dry mode.  Overwriting an existing page
        always asks for confirmation.
        """
        site = pywikibot.getSite()
        # namespace number 104 is looked up in the family's namespace table;
        # NOTE(review): presumably the "Page" namespace -- confirm per wiki
        page_namespace = site.family.namespaces[104][site.lang]
        page = pywikibot.Page(site, u'%s:%s/%d'
                              % (page_namespace, self.prefix, pageno))
        exists = page.exists()
        djvutxt = self.get_page(pageno)

        if not djvutxt:
            return

        text = u'<noinclude><pagequality level="1" user="%s" /><div class="pagetext">\n\n\n</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username,djvutxt)

        # convert to wikisyntax
        # this adds a second line feed, which makes a new paragraph
        text = self._separators_to_newlines(text)

        # only save if something was changed
        # automatically ask if overwriting an existing page
        ask = self.ask

        if exists:
            ask = True
            old_text = page.get()
            if old_text == text:
                pywikibot.output(u"No changes were needed on %s" % page.aslink())
                return
        else:
            old_text = ''
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        pywikibot.showDiff(old_text, text)
        if self.dry:
            pywikibot.inputChoice(u'Dry mode... Press enter to continue', [],
                                  [], 'dummy')
            return
        if ask:
            choice = pywikibot.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
        else:
            choice = 'y'
        if choice == 'y':
            try:
                # Save the page
                page.put_async(text)
            except pywikibot.LockedPage:
                pywikibot.output(u"Page %s is locked; skipping." % page.aslink())
            except pywikibot.EditConflict:
                pywikibot.output(u'Skipping %s because of edit conflict' % (page.title()))
            except pywikibot.SpamfilterError as error:
                pywikibot.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))


def main():
    """
    Parse the command line, download the djvu file and run the OCR bot.

    Recognised arguments are -dry, -ask, -djvu:, -filelang:, -index: and
    -pages:.  The help text is shown if no djvu file is given.  The local
    copy of the djvu file is removed when the function is left, whether the
    bot finished or an error was raised.

    Raises OSError if the djvu file is not available locally after the
    download attempt, pywikibot.PageNotFound if the site is not a Wikisource
    and pywikibot.NoPage if the index page does not exist.
    """
    index = None
    djvu = None
    pages = None
    # tesseract language used when -filelang: is not given
    filelang = 'eng'
    # what would have been changed.
    dry = False
    ask = False

    # Parse command line arguments
    for arg in pywikibot.handleArgs():
        if arg.startswith("-dry"):
            dry = True
        elif arg.startswith("-ask"):
            ask = True
        elif arg.startswith("-djvu:"):
            djvu = arg[6:]
        elif arg.startswith("-filelang:"):
            filelang = arg[10:]
        elif arg.startswith("-index:"):
            index = arg[7:]
        elif arg.startswith("-pages:"):
            pages = arg[7:]
        else:
            pywikibot.output(u"Unknown argument %s" % arg)

    # Nothing to do (and nothing to clean up) without a djvu file
    if not djvu:
        pywikibot.showHelp()
        return

    djvuPage = pywikibot.ImagePage(pywikibot.getSite(), 'Image:%s' % djvu)
    try:
        pywikibot.output('Download of the djvu file')
        try:
            urllib.urlretrieve(djvuPage.fileUrl(), djvu)
        except Exception:
            # best effort: a local copy of the file may still be usable;
            # KeyboardInterrupt and SystemExit are deliberately not caught
            pywikibot.output(u'The djvu file can\'t be downloaded')
        # Check the djvu file exists; raises OSError otherwise
        os.stat(djvu)
        if not index:
            index = os.path.basename(djvu)
        if not index:
            pywikibot.showHelp()
            return

        site = pywikibot.getSite()
        index_page = pywikibot.Page(site, index)

        if site.family.name != 'wikisource':
            raise pywikibot.PageNotFound(u"Found family '%s'; Wikisource required." % site.family.name)

        # retry with the index namespace prefix configured on the wiki
        if not index_page.exists() and index_page.namespace() == 0:
            index_namespace = pywikibot.Page(site, 'MediaWiki:Proofreadpage index namespace').get()

            index_page = pywikibot.Page(pywikibot.getSite(),
                                        u"%s:%s" % (index_namespace, index))
        if not index_page.exists():
            raise pywikibot.NoPage(u"Page '%s' does not exist" % index)
        pywikibot.output(u"OCRing text from %s to %s" % (djvu, index_page.aslink()) )
        bot = DjVuOCRBot(djvu, index, pages, filelang, ask, dry)
        bot.run()
    finally:
        # remove the local copy even when one of the steps above failed
        if os.path.exists(djvu):
            os.remove(djvu)

if __name__ == "__main__":
    try:
        main()
    finally:
        # Always call pywikibot.stopme(), even when main() raised an error.
        pywikibot.stopme()