# User:Inductiveload/Scripts/Page namespace editor

import pw_script_header
import wikipedia
import codecs
import re

# NOTE(review): FIX is not referenced by any code visible in this file;
# presumably it names the kind of fix being applied -- confirm before relying on it.
FIX = 'newline'
# Path of the UTF-8 text file that main() reads; each line is used as a page title.
FILE= r'/home/john/src/pw/zz_filelist0.txt'
# Passed to page.put() together with the new wikitext when a page is uploaded.
SUMMARY =  "[bot] Tidying formatting."

# Header is the FIRST <noinclude> section (non-greedy), footer is the LAST
# one (greedy body pushes it as far right as possible).  \A and \Z anchor the
# match to the whole text so nothing outside the match can be lost when the
# page is recomposed; only trailing whitespace is tolerated after the footer.
_PAGE_PARTS_RE = re.compile(
    r'\A<noinclude>(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>\s*\Z',
    re.DOTALL)


def decomposePage(wikiText):
    """Split page wikitext into its header, body and footer.

    The text must start with a ``<noinclude>`` section (the header) and end
    with a ``<noinclude>`` section (the footer), optionally followed by
    whitespace.  Everything in between is the body, including any further
    ``<noinclude>`` sections it may contain.

    Returns a ``(header, body, footer)`` tuple holding the text inside the
    header tags, the text between the two sections, and the text inside the
    footer tags.  If the text does not have that shape, a message is printed
    and ``None`` is returned.
    """
    m = _PAGE_PARTS_RE.search(wikiText)

    if m is None:
        # Parenthesised single-argument form prints identically on Python 2.
        print("Can't find header, body, footer")
        return None

    return m.group(1), m.group(2), m.group(3)


def composePage(header, body, footer):
    """Reassemble page wikitext from its three parts.

    The header and footer are each wrapped in ``<noinclude>`` tags and the
    body is placed, unwrapped, between them.
    """
    section = '<noinclude>%s</noinclude>'
    layout = section + '%s' + section
    parts = (header, body, footer)
    return layout % parts

def process_body(body):
    """Join hard-wrapped lines of the page body.

    A single newline that sits between two non-newline characters is
    replaced by one space, together with any spaces directly in front of it.
    Runs of two or more newlines (blank lines) and a newline at the very
    start or end of the text are left untouched.

    Returns the processed body text.
    """
    # Lookarounds check the neighbouring characters without consuming them.
    # Capturing them instead would make the character after one line break
    # unavailable as the character before the next, so one-character lines
    # ("a\nb\nc") would only be joined every other time.
    return re.sub(r'(?<=[^\n]) *\n(?=[^\n])', ' ', body)

def process_header(header):
    """Return the page header after processing.

    No header transformations are applied at present, so the value is
    handed back exactly as received.
    """
    processed = header
    return processed

def process_footer(footer):
    """Return the page footer after processing.

    No footer transformations are applied at present, so the value is
    handed back exactly as received.
    """
    processed = footer
    return processed

def main():
    """Tidy every page listed in FILE, asking before each upload.

    FILE is read as UTF-8 with one page title per line.  For each title the
    page's wikitext is fetched from English Wikisource, split into header,
    body and footer, each part is processed, and the page is recomposed.
    The diff and the new text are shown and the page is saved only if the
    user answers yes.

    Pages that cannot be split into header/body/footer, and pages whose text
    is unchanged after processing, are skipped.
    """
    # Read every title first so the file is closed before the slow,
    # interactive per-page loop starts (and is closed even on error).
    in_file = codecs.open(FILE, 'r', 'utf-8')
    try:
        # Iterating a file yields lines with their trailing newline attached;
        # strip it so it does not become part of the page title.
        page_titles = [line.strip() for line in in_file]
    finally:
        in_file.close()

    ws_site = wikipedia.getSite("en", "wikisource")

    for page_title in page_titles:

        if not page_title:
            continue  # blank line in the list file

        # Parenthesised single-argument print behaves the same on Python 2.
        print('(INF) Processing page: %s' % page_title)

        page = wikipedia.Page(ws_site, page_title)  # get the page
        old_wikitext = page.get()  # extract wikitext

        parts = decomposePage(old_wikitext)  # decompose the page
        if parts is None:
            # decomposePage() has already reported the problem; unpacking
            # None would abort the whole run, so move on to the next page.
            print('(INF) Skipping page: %s' % page_title)
            continue
        header, body, footer = parts

        body = process_body(body)  # process the body
        header = process_header(header)  # header
        footer = process_footer(footer)  # footer

        # make a well formed Page: namespace page
        new_wikitext = composePage(header, body, footer)

        if new_wikitext == old_wikitext:
            # Nothing to review or upload.
            print('(INF) No changes for page: %s' % page_title)
            continue

        wikipedia.showDiff(old_wikitext, new_wikitext)

        print(new_wikitext)

        cont = raw_input("Upload? [y/n]: ")
        if cont in ['y', 'Y', 'yes', 'Yes']:
            page.put(new_wikitext, SUMMARY, minorEdit=True)

# Run the bot only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()