#!/usr/bin/python
# -*- coding: utf-8 -*-
#import botpywi
import catlib
import pagegenerators
import wikipedia
import re
import sys
import time
def add_to_log(msg, opt):
    """Append msg to the on-wiki move log; return False on edit conflict."""
    log_page = wikipedia.Page(opt.site, u"User:Phe/Log move page")
    new_text = log_page.get() + msg
    try:
        log_page.put(new_text, u'Journal des erreurs de déplacement')
    except wikipedia.EditConflict:
        return False
    return True
log_msg = u''
def flush_log(opt):
global log_msg
if not log_msg:
return
if not add_to_log(u'\n' + log_msg, opt):
time.sleep(10)
if not add_to_log(u'\n' + log_msg, opt):
print u'ERROR: Unable to log last error message'
log_msg = u''
log_msg = u''
def log_error(msg, page):
err_msg = u'Error, page: [[%s]], %s' % (page.title(), msg)
print err_msg
global log_msg
log_msg += u'*' + err_msg + u'\n'
def regex_from_title(old_title):
    """Build a regexp fragment matching old_title inside a wiki link.

    Every regex metacharacter is backslash-escaped (the original escape
    list u'()*.?+.' duplicated '.' and missed [, ], {, }, ^, $, | and
    the backslash itself, so titles containing those produced broken or
    over-matching patterns).  Runs of spaces are made interchangeable
    with underscores, as wiki link targets treat them the same way.
    """
    escaped = u''
    for ch in old_title:
        if ch in u'()[]{}*.?+^$|\\':
            escaped += u'\\'
        escaped += ch
    return escaped.replace(u' ', u'[ _]+')
def move_one_page(page, new_title, opt):
if not page.exists():
return
print 'move page from;', page.title(), 'to:', new_title
if not opt.dry_run:
# Passing leaveRedirect = false to move() is more efficient but is
# not enough stable actually (2009/10) and anyway it breaks things
# until we have corrected redirect in a later step. So we defer
# deletion of created redirect later.
# Leaving the redirect in place make also the process more robust,
# if the script stop on an exception it'll remain page to correct and
# redirect to delete but nothing will be broken on the wiki (no red
# link but link through redirect will exist)
while True:
# Should not be required but safer, as the throttle from
# pywikipedia is too small for a move operation.
time.sleep(10)
if page.move(new_title, u'Conventions sur les titres'):
break
print "move fail, sleeping 30 seconds"
page = wikipedia.Page(page.site(), page.title())
time.sleep(30)
# Some of the pages to modify may themselves already have been moved.
# Map such titles to their new names: we cannot use
# p.get(follow_redirect = False) because some linked pages were redirects
# before the move, and leaving the redirect in place does not help — we
# would edit the redirect instead of its target.
def fixup_pagename(pages, titles):
    """Return `pages` with every moved title replaced by its new Page."""
    remapped = set()
    for page in pages:
        new_names = [new for (old, new) in titles if page.title() == old]
        if new_names:
            for name in new_names:
                remapped.add(wikipedia.Page(page.site(), name))
        else:
            remapped.add(page)
    return remapped
def fix_redirect(pages, titles, opt):
if not opt.dry_run:
pages = fixup_pagename(pages, titles)
pages = pagegenerators.PreloadingGenerator(pages)
for p in pages:
text = p.get(get_redirect = True)
new_text = text
for t in titles:
regexp = u'\[\[[ ]*' + regex_from_title(t[0]) + u'[ ]*(\||\]|#)'
repl = u'[[' + t[1] + u'\\1'
new_text = re.sub(regexp, repl, new_text)
if opt.dry_run:
print 'Changing', p.title()
wikipedia.showDiff(text, new_text)
if not opt.dry_run:
p.put(new_text,
comment = u'Correction des redirects après renommage')
def delete_redirect(p, opt):
if p.exists():
if p.isRedirectPage():
print 'deleting page:', p.title()
if len(list(p.getReferences())):
if not opt.dry_run:
log_error(u'skipping deletion, linked page exists, please fix linked pages and delete the redirect manually', p)
else:
if not opt.dry_run:
if not p.delete(u'Nettoyage après correction des liens',
prompt = False):
log_error(u'Impossible de détruire la page', p)
else:
if not opt.dry_run:
log_error(u'skipping deletion, page is not a redirect, check linked page before deletion', p)
def delete_all_redirect(titles, opt):
    """Delete — or merely report — the redirect left behind for each title."""
    if opt.delete_redirect:
        for title in titles:
            delete_redirect(wikipedia.Page(opt.site, title), opt)
    elif not opt.dry_run:
        # Not allowed/asked to delete: just log the leftovers for review.
        page_list = [wikipedia.Page(opt.site, title) for title in titles]
        for page in pagegenerators.PreloadingGenerator(page_list):
            if page.exists():
                log_error(u'redirect created, check linked page before deletion', page)
# The next two functions must be edited to change the naming scheme: this
# one rewrites a title, the next one must return a true value when a title
# is a candidate for a change.  (Past schemes added extra replace()/re.sub()
# calls here, e.g. comma or colon to '/' for sub-pages, case fixes, or
# stripping trailing digits into sub-page numbers.)
def change_title(title):
    """Return the new title for `title` under the current naming scheme."""
    # ' - ' marks a sub-page boundary.
    title = title.replace(u' - ', u'/')
    # Always normalize apostrophes, at least on fr:.
    for straight in (u"'", u'‘'):
        title = title.replace(straight, u'’')
    return title
def filter_title(title):
    """Return a truthy match when `title` is a candidate for renaming."""
    candidate_marks = u".*('|‘| - ).*"
    return re.match(candidate_marks, title)
def move_pages(gen, opt):
    """Rename every main-namespace page from `gen` whose title matches
    filter_title(), then fix incoming links and clean up redirects.

    In sync mode (opt.sync_between_page) links are fixed and redirects
    deleted after each individual move; otherwise everything is
    accumulated and processed once at the end.
    """
    pages = set()   # pages linking to moved pages, to be re-targeted
    titles = set()  # (old_title, new_title) pairs
    seen = set()
    for page in gen:
        if page.namespace() != 0:
            continue
        # Strip any section anchor; the same page can be linked with
        # different anchors, hence the `seen` guard.
        old_title = page.title().split(u'#')[0]
        if old_title in seen:
            continue
        seen.add(old_title)
        if not filter_title(old_title):
            continue
        new_title = change_title(old_title)
        # Don't try to move a page onto itself.
        if old_title == new_title:
            continue
        # Collect linked pages first, to avoid querying the references of
        # a page moved a few seconds ago.
        pages = pages.union(pagegenerators.ReferringPageGenerator(page))
        move_one_page(page, new_title, opt)
        titles.add((old_title, new_title))
        talk_page = page.toggleTalkPage()
        titles.add((talk_page.title(), change_title(talk_page.title())))
        if opt.fix_redirect and opt.sync_between_page:
            fix_redirect(pages, titles, opt)
            # BUG FIX: the original reset `titles` to an empty set before
            # calling delete_all_redirect, so in sync mode the deletion
            # pass always received an empty list and no redirect was ever
            # deleted.  Delete (and flush the log) first, clear after.
            delete_all_redirect([old for old, new in titles], opt)
            flush_log(opt)
            pages = set()
            titles = set()
    if opt.fix_redirect and not opt.sync_between_page:
        fix_redirect(pages, titles, opt)
    delete_all_redirect([old for old, new in titles], opt)
    flush_log(opt)
if __name__ == '__main__':
try:
class Options:
pass
options = Options()
options.dry_run = False
options.fix_redirect = False
options.sync_between_page = True
options.delete_redirect = False
gen = None
options.site = wikipedia.getSite(code = 'fr', fam = 'wikisource')
for arg in sys.argv[1:]:
if arg == '-dry-run':
options.dry_run = True
elif arg.startswith('-cat:'):
cat_name = unicode(arg[len('-cat:'):], u'utf-8')
cat = catlib.Category(options.site, cat_name)
gen = pagegenerators.CategorizedPageGenerator(cat)
options.sync_between_page = True
elif arg.startswith('-prefix:'):
prefix_name = unicode(arg[len('-prefix;'):], 'utf-8')
gen = pagegenerators.PrefixingPageGenerator(prefix_name, includeredirects = False, site = options.site)
options.sync_between_page = False
elif arg.startswith('-links:'):
pagename = unicode(arg[len('-links:'):], 'utf-8')
page = wikipedia.Page(options.site, pagename)
gen = pagegenerators.LinkedPageGenerator(page)
options.sync_between_page = False
elif arg.startswith('-page:'):
pagename = unicode(arg[len('-page:'):], 'utf-8')
gen = [ wikipedia.Page(options.site, pagename) ]
options.sync_between_page = False
elif arg == '-fix-redirect':
options.fix_redirect = True
elif arg == '-delete-redirect':
options.delete_redirect = True
elif arg == '-help':
print sys.argv[0], '[-dry-run | -help | -cat:cat_name | -prefix:prefix_name | -fix-redirect | -delete-redirect ]'
sys.exit(1)
if options.delete_redirect:
if not options.site.isAllowed('delete', True):
print 'You asked to delete redirect but have not enough right todo that'
sys.exit(1)
if not options.fix_redirect:
print 'You asked to delete redirect but not to -fix-redirect'
sys.exit(1)
move_pages(gen, options)
finally:
flush_log(options)
wikipedia.stopme()