# User:TalBot/xo pp fix.py

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Fix extra stuff before Executive Orders and Presidential Proclamations
#
# run with args "-log -putthrottle:xx"
#
# Copyright (C) 2007, GrafZahl (en.wikisource.org user)
#
# Licence: GPLv2
#

import pagegenerators, re, wikipedia

# Set the delay of the framework's read throttle to 5.
wikipedia.get_throttle.setDelay(5)

# Let the framework process the command line; whatever it hands back is
# not understood by this script either, so just report it and move on.
for unknown_arg in wikipedia.handleArgs():
    wikipedia.output(
        u'(WWW) Ignoring unrecognised argument: %s' % unknown_arg)

# Edit summary attached to every page saved by this script.
summ = u'Removing garbage before {{header}}'

# Matches the opening of a header template: "{{", optional whitespace,
# then "Header" or "header".
header_xp = re.compile(r'\{\{\s*[Hh]eader')

# One page generator per title prefix to be checked.
xo_pages = pagegenerators.PrefixingPageGenerator(u'Executive Order')
pp_pages = pagegenerators.PrefixingPageGenerator(u'Proclamation')

# Procedure to fix extra stuff before header

def fix_stuff_before_header(page):
    """Remove everything that precedes the first header template on *page*.

    Redirect pages, and pages in which ``header_xp`` finds no match, are
    reported and left untouched.  Otherwise the text before the start of
    the first match is dropped and, if anything was actually dropped, the
    shortened text is saved with the edit summary ``summ`` as a non-minor
    edit.

    page -- page object providing title(), isRedirectPage(), get() and
            put().

    Returns None in every case.
    """
    wikipedia.output(u'(III) Checking [[%s]]' % page.title())
    if page.isRedirectPage():
        wikipedia.output(u'   (III) Skipping page, redirect')
        return
    text = page.get()
    match = header_xp.search(text)
    # None is a singleton: test identity rather than equality.
    if match is None:
        wikipedia.output(u'   (III) Skipping page, no header')
        return
    newtext = text[match.start():]
    # Only save when the header was not already at the very beginning.
    if newtext != text:
        wikipedia.output(u'   (III) Removing garbage before header')
        page.put(newtext, summ, minorEdit=False)

# Run the fix over every page from each generator: Executive Orders
# first, then Proclamations.
for generator in (xo_pages, pp_pages):
    for page in generator:
        fix_stuff_before_header(page)