# -*- coding: utf-8 -*-
# Builds a wikitext listing of Wikisource Author: pages, for updating the
# project's "Wikisource:Authors-*" index pages with authors not yet linked.
import xml.etree.cElementTree as ET
import urllib2
import urllib
import time

rooturl = "http://en.wikisource.org/w/api.php?"

class AuthorUpdater(object):
	
	def update_authors(self,all_authors=False):
		# with all_authors=True, list every Author: page, not just the unindexed ones
		# get wikitext of all author-index pages (project pages named "Authors-*")
		querystring = "action=query&generator=allpages&gaplimit=50&gapfilterredir=nonredirects&gapprefix=Authors-&prop=revisions&gapnamespace=4&rvprop=content&format=xml"
		url = rooturl+querystring
		page = download_page(url)
		# create the set of authors already linked from the index pages;
		# index lines look like e.g. "*[[Author:John Smith|Smith, John]] (1800 – 1870)"
		lines = [x for x in page.split("\n") if "Author:" in x]
		self.indexed_authors = set()
		for line in lines:
			author = line.split("Author:")[1].split("|")[0].split("]")[0]
			self.indexed_authors.add(author)

		# get list of all Author: pages
		namespace = "102" # the Author: namespace on English Wikisource
		querystring = "action=query&list=allpages&apnamespace=%s&aplimit=500&apfilterredir=nonredirects&format=xml" % namespace
		url = rooturl+querystring
		node = recursive_download(url).find("query").find("allpages")
		self.all_authors = set([dict(x.items())["title"].encode("utf-8").split("Author:")[1] for x in node])

		# create set of unlinked authors
		if all_authors:
			self.unlinked = self.all_authors
		else:
			self.unlinked = self.all_authors - self.indexed_authors
		
		self.process_lists(self.unlinked)

		# digest each page's wikitext into (lastname, firstname, birthyear, deathyear)
		# dixion maps fullname -> 4-tuple
		self.dixion = {}
		for authorname in self.texts:
			if "/" in authorname: continue # skip subpages
			text = self.texts[authorname]
			if "{{author" not in text.lower(): continue
			out_list = []
			for param in ["lastname","firstname","birthyear","deathyear"]:
				out_list.append(self.get_param_value(param,text))
			self.dixion[authorname] = tuple(out_list)

		# generate wikitext list of missing authors
		self.sortable = [(self.dixion[x][0].upper()+self.dixion[x][1].upper(),x) for x in self.dixion.keys()]
		self.sortable.sort()
		self.sorted = [x[1] for x in self.sortable]
		self.out_lines = self.collate(self.sorted,self.dixion)
		self.output = "\n".join(self.out_lines)
		return self.output

	def process_lists(self,unlinked):
		# fetch the wikitext of every unlinked author page; the API caps
		# the number of titles per request, so work in chunks of 50
		thelist=list(unlinked)
		thelist=["Author:"+x for x in thelist]
		self.texts={}
		while thelist:
			print len(thelist) # titles still to fetch
			chunk = [urllib.quote(x) for x in thelist[:50]]
			thelist = thelist[50:]
			batch = "|".join(chunk)
			try: print batch
			except Exception: print "unprintable" # console encoding may choke on the titles
			querystring = "action=query&prop=revisions&titles=%s&rvprop=content&format=xml" % batch
			url = rooturl+querystring
			self.url = url
			page = download_page(url)
			for result in ET.XML(page).find("query").find("pages"):
				revisions = result.find("revisions")
				if revisions is None: continue # page had no fetchable revision
				text = revisions.find("rev").text
				if not text: continue
				text = text.encode("utf-8")
				title = dict(result.items())["title"].encode("utf-8").split("Author:")[1]
				self.texts[title] = text
			time.sleep(60) # stay well under the API rate limits
		return self.texts

		
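	# For reference, collate() starts a new "==X==" heading whenever the
	# first letter of the sort name changes, then emits one bullet per
	# author. Illustrative output (name and dates invented):
	#	==D==
	#	*[[Author:Jane Doe|Doe, Jane]] (1850 – 1920)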
	def collate(self,sorted_keys,dixion):
		lines=[]
		firstletter_before=""
		for key in sorted_keys:
			namestring = key
			datestring = "" # defaults
			data = dixion[key]
			if data[0] and data[1]:
				namestring = "%s, %s" % (data[0],data[1])
			elif data[0] and not data[1]: # occasionally users choose to only explicitly specify surname
				if data[0] in key:
					firstname = key.replace(data[0],"").strip()
					namestring = "%s, %s" % (data[0],firstname)
			elif data[1] and not data[0]: # rare: firstname given without a lastname
				if data[1] in key:
					surname = key.replace(data[1],"").strip()
					namestring = "%s, %s" % (surname,data[1])
			
			if data[2] or data[3]:
				datestring = "(%s – %s)" % (data[2],data[3])
			line = "*[[Author:%s|%s]] %s" % (key,namestring,datestring)
			firstletter_now = namestring[0].upper()
			if firstletter_before != firstletter_now:
				# start a new alphabetical section
				print firstletter_before, firstletter_now
				extraline = "\n==%s==\n" % firstletter_now
				lines.append(extraline)
			firstletter_before = firstletter_now
			lines.append(line)
		return lines
		
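	# get_param_value() below does plain string surgery rather than real
	# template parsing; it copes with one-field-per-line layouts roughly
	# like this (values invented for illustration):
	#	{{author
	#	| firstname = Jane
	#	| lastname  = Doe
	#	| birthyear = 1850
	#	| deathyear = 1920
	#	}}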
	def get_param_value(self,param,wikitext):
		if param not in wikitext: return ""
		try:
			# take the rest of the line after the parameter name, then the
			# value between "=" and any following "|"
			rest = wikitext.split(param)[1].split("\n")[0]
			output = rest.split("=")[1].split("|")[0].strip()
			if "<!--" in output:
				# strip an HTML comment, whether or not it closes on this line
				if "-->" in output:
					output = output.split("<!--")[0] + output.split("-->")[1]
				else:
					output = output.split("<!--")[0]
				output = output.strip()
			return output
		except IndexError:
			# the parameter appeared without a usable "=value" part
			return ""
	
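# When a result set is truncated, the old-style MediaWiki API appends a
# continuation element to the response, roughly of this shape:
#	<api>
#	  <query><allpages>...</allpages></query>
#	  <query-continue><allpages apfrom="Next title"/></query-continue>
#	</api>
# recursive_download() keeps re-requesting with that parameter appended
# until the element disappears, merging each batch into the first tree.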
def recursive_download(baseurl):
	firstpage = download_page(baseurl)
	page = firstpage
	node = ET.XML(firstpage)
	continuer = node.find("query-continue")
	more_items_exist = continuer is not None
	while more_items_exist:
		# the continuation element holds a single child whose one attribute
		# names the continue parameter, e.g. apfrom="Next title"
		param, startfrom = list(continuer)[0].items()[0]
		startfrom = urllib.quote(startfrom.encode("utf-8"))
		url = baseurl+"&"+param+"="+startfrom
		print startfrom, len(list(node.find("query"))[0]) # progress
		newpage = download_page(url)
		newnode = ET.XML(newpage)
		continuer = newnode.find("query-continue")
		more_items_exist = continuer is not None
		# merge this batch's items into the first response's result element
		for item in list(newnode.find("query"))[0]:
			list(node.find("query"))[0].append(item)
		time.sleep(60) # throttle between continuation requests
	return node
		

def download_page(url):
	page = ""
	while not page:
		try:
			page = urllib2.urlopen(url).read()
			if not page:
				# empty response body: stop rather than retry forever
				break
		except Exception:
			# network error: wait, then retry the same URL
			time.sleep(60)
			continue
	return page
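
if __name__ == "__main__":
	# Minimal usage sketch: update_authors() returns the finished wikitext;
	# the output filename below is illustrative, not part of the class.
	updater = AuthorUpdater()
	wikitext = updater.update_authors(all_authors=False)
	with open("missing_authors.txt", "w") as f:
		f.write(wikitext)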