User:Ineuw/proofreading.js

Note: After saving, changes may not occur immediately. Click here to learn how to bypass your browser's cache.
Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Cmd-R on a Mac)
Google Chrome: Press Ctrl-Shift-R (Cmd-Shift-R on a Mac)
Internet Explorer: Hold Ctrl while clicking Refresh, or press Ctrl-F5
Opera: Clear the cache in Tools → Preferences
For details and instructions about other browsers, see Wikipedia:Bypass your cache.
Code that you insert on this page could contain malicious content capable of compromising your account. If you are unsure whether code you are adding to this page is safe, you can ask at the central discussion page, Scriptorium. The code will be executed when previewing this page under some skins, including Monobook. You can Purge this page in the interim if you wish to refresh the content sooner under another skin.
Documentation for this script can be added at User:Ineuw/proofreading.
// <nowiki>

/*
This page defines a TemplateScript library. It's not meant to be referenced
directly. See [[Wikisource:TemplateScript]] for usage.
*/

/* global $, pathoschild */

/**
 * TemplateScript adds configurable templates and scripts to the sidebar, and adds an example regex editor.
 * @see https://meta.wikimedia.org/wiki/TemplateScript
 * @update-token [[File:Pathoschild/templatescript.js]]
 */
// <nowiki>
$.ajax('//tools-static.wmflabs.org/meta/scripts/pathoschild.templatescript.js', { dataType:'script', cache:true }).then(function() {
	/*********
	** Define library
	*********/
	// The only tool currently exposed in the sidebar: OCR clean-up for pages
	// in the Page: namespace.
	var cleanupOcrScript = {
		key: 'cleanup-ocr',
		name: 'Clean up OCR',
		script: function(editor) { pageCleanup(editor); },
		forNamespaces: 'page'
	};

	// The remaining page tools are defined further down in this file but are
	// deliberately left disabled here.
	var pageTools = {
		name: 'Page tools',
		scripts: [
			// { key: 'add-header', name: 'Add header', script: function(editor) { addPageHeader(editor); }, forNamespaces: 'page' },
			// { key: 'add-footer', name: 'Add footer', script: function(editor) { addPageFooter(editor); }, forNamespaces: 'page' },

			cleanupOcrScript

			// { key: 'make-ref', name: 'Make reference', script: function(editor) { makeReference(editor); }, forNamespaces: 'page' },
			// { key: 'smallcaps', name: 'Convert to small-caps', script: function(editor) { smallcaps(editor); }, forNamespaces: 'page' },
			// { key: 'uppercase', name: 'Convert to uppercase', script: function(editor) { upper(editor); }, forNamespaces: 'page' }
		]
	};

	// Register the library with TemplateScript.
	pathoschild.TemplateScript.library.define({
		key: 'wikisource.proofreading',
		name: 'Proofreading tools',
		url: '//en.wikisource.org/wiki/Wikisource:TemplateScript#Proofreading',
		description: 'A set of scripts for <a href="/wiki/Help:Proofreading">proofreading works in the <tt>Page:</tt> namespace</a>. This includes tools for cleaning up OCR, generating page 		templates, and adding common text formatting.',
		categories: [pageTools]
	});

	/*********
	** Page context
	*********/
	// Shared page context used by the script methods; populated by _initialise().
	var state = {};

	// whether the page context has been initialised
	state.initialised = false;

	state.page = {
		// the djvu page number extracted from the URL
		number: null,
		// result of matching the #pagequality element's class (set in _initialise)
		proofed: null
	};

	// work-specific header template formats
	state.specialFormats = [];

	/*********
	** Private methods
	*********/
	/**
	 * Populate the shared page context the first time a page tool runs;
	 * every later call returns without doing anything.
	 *
	 * Reads the page number from the edit URL, the proofread status from the
	 * class of the #pagequality element, and any user-defined work formats
	 * from window.specialFormats.
	 */
	var _initialise = function() {
		if(!state.initialised) {
			state.initialised = true;

			// page number: only URLs of the form "….djvu/<n>&action=edit" are recognised
			var urlMatch = /\.djvu\/([0-9]+)&action=edit/g.exec(location.href);
			var pageNumber = null;
			if(urlMatch !== null)
				pageNumber = parseInt(urlMatch[1], 10);

			// proofread status: truthy when the class contains quality0 or quality2-4
			var qualityElement = document.getElementById('pagequality');
			var qualityClass = qualityElement && qualityElement.getAttribute('class');

			state.page = {
				number: pageNumber,
				proofed: qualityClass && qualityClass.match(/quality0|quality[2-4]/)
			};

			// user-defined work formats; each entry is expected to look like:
			//   {
			//      title: /History of England /,
			//      evenHeader: '{{rh|...}}',
			//      oddHeader: '{{rh|...}}',
			//      footer: '',
			//      footerWithReferences: '{{smallrefs}}'
			//   }
			var userFormats = window.specialFormats;
			state.specialFormats = userFormats ? [].concat(userFormats) : [];
		}
	};
	
	/**
	 * Convert the text to title case based on English rules.
	 *
	 * The text is lower-cased and split on single spaces. Every word is then
	 * capitalised except articles, short prepositions and conjunctions; the
	 * first and last words are always capitalised.
	 * @param {string} text The text to convert.
	 * @returns {string} The text in title case.
	 */
	var _titlecase = function(text) {
		// articles, "to" as part of an infinitive, prepositions and short
		// conjunctions are not capitalised (unless first or last)
		var minorWords = [
			"a", "an", "and", "as", "at", "but", "by", "etcetera", "etc.",
			"for", "from", "in", "nor", "of", "o'", "on", "or", "the", "to",
			"with", "versus", "vs.", "v.", "yet"
		];

		// upper-case the first character of a word; an empty word stays empty
		var capitalise = function(word) {
			return word.substring(0, 1).toUpperCase() + word.substring(1);
		};

		// split text into individual words and examine them one by one.
		// (The previous implementation called $.each() without passing the
		// words array, so this loop never ran.)
		var words = text.toLowerCase().split(" ");
		for (var i = 0; i < words.length; i++) {
			if (minorWords.indexOf(words[i]) === -1)
				words[i] = capitalise(words[i]);
		}

		// capitalise first word regardless
		words[0] = capitalise(words[0]);

		// capitalise last word regardless
		var last = words.length - 1;
		words[last] = capitalise(words[last]);

		// reconstruct title
		return words.join(' ');
	};

	/*********
	** Script methods
	*********/
	/**
	 * Add a {{running header}} template to the page.
	 *
	 * Does nothing when no page number could be extracted from the URL.
	 * Otherwise the first user-defined format whose title pattern matches the
	 * page title supplies the header (even/odd variant chosen by page number);
	 * if none matches, a generic running header is appended to the header box.
	 * On pages not marked as proofed, the first line of the text is removed.
	 * @param {object} editor The script helpers for the page.
	 */
	var addPageHeader = function(editor) {
		_initialise();

		if(state.page.number === null)
			return;

		var isEven = (state.page.number % 2 === 0);
		var generic = true;
		var headertext = '';

		// Indexed loop rather than for...in: for...in also visits enumerable
		// properties added to Array.prototype by other scripts, and such an
		// entry (with an undefined title) would match every page.
		var title = mw.config.get('wgTitle');
		for (var i = 0; i < state.specialFormats.length; i++) {
			var format = state.specialFormats[i];
			if (title.match(format.title)) {
				headertext = isEven ? format.evenHeader : format.oddHeader;
				generic = false;
				break;
			}
		}

		// no special header matched, use a generic running header
		if (generic) {
			if (isEven)
				headertext = '{{running header|left=|center=}}'; // assume verso, with page number at left
			else
				headertext = '{{running header|center=|right=}}';
		}

		// append to whatever is already in the header box
		$('#wpHeaderTextbox').val(function(i, val) {
			return $.trim(val + '\n' + headertext);
		});

		// if this is unproofed text, then delete the first line of the OCR text, which presumably is raw OCR of the header we've just inserted
		if (!state.page.proofed) {
			var text = editor.get();
			editor.set(text.slice(text.indexOf('\n') + 1));
		}
	};

	/**
	 * Clean up OCR errors in the text, and push <noinclude> content at the top
	 * & bottom of the page into the header & footer boxes respectively.
	 *
	 * After the text has been cleaned, the cursor is moved to the top of the
	 * main edit box (when that box exists).
	 * @param {object} editor The script helpers for the page.
	 */
	var pageCleanup = function(editor) {
		_initialise();

		var OPEN_TAG = '<noinclude>';
		var CLOSE_TAG = '</noinclude>';
		var text;

		// push <noinclude> content at the top into the header box
		text = editor.get();
		if (text.match(/^<noinclude\>/)) {
			var headerEnd = text.indexOf(CLOSE_TAG);
			// an unterminated block is left alone rather than guessing where it ends
			if (headerEnd !== -1) {
				var headerText = text.slice(OPEN_TAG.length, headerEnd).replace(/^\s+|\s+$/g, '');
				$('#wpHeaderTextbox').val(function(i, val) {
					return $.trim(val + "\n" + headerText);
				});
				editor.set(text.slice(headerEnd + CLOSE_TAG.length));
			}
		}

		// push <noinclude> content at the bottom into the footer box
		text = editor.get();
		if (text.match(/<\/noinclude\>$/)) {
			var footerStart = text.lastIndexOf(OPEN_TAG);
			// without this guard a stray closing tag (no opener) gave an index
			// of -1 and the whole page text was discarded
			if (footerStart !== -1) {
				var footerText = text.slice(footerStart + OPEN_TAG.length, text.length - CLOSE_TAG.length).replace(/^\s+|\s+$/g, '');
				$('#wpFooterTextbox').val(function(i, val) {
					return $.trim(footerText + "\n" + val);
				});
				editor.set(text.slice(0, footerStart));
			}
		}

		// clean up text
		editor
			// remove trailing spaces at the end of each line
			.replace(/ +\n/g, '\n')

			// remove trailing whitespace preceding a hard line break
			.replace(/ +<br *\/?>/g, '<br />')

			// remove trailing whitespace and numerals at the end of page text
			// (numerals are nearly always page numbers in the footer)
			.replace(/[\s\d]+$/g, '')

			// remove trailing spaces at the end of refs
			.replace(/ +<\/ref>/g, '</ref>')

			// remove trailing spaces at the end of template calls
			.replace(/ +}}/g, '}}')

			// convert double-hyphen to mdash (avoiding breaking HTML comment syntax)
			.replace(/([^\!])--([^>])/g, '$1—$2')

			// Ineuw modifications: put a space on either side of every mdash,
			// remove spacing around hyphens, remove carets, fix 'modem'.
			// The mdash rules add spaces unconditionally, so an mdash that was
			// already spaced ends up with double spaces; those are collapsed
			// further down.
			.replace(/—/g, '— ') // mdash followed by space
			.replace(/—/g, ' —') // mdash preceded by space
			.replace(/- /g, '-') // hyphen followed by space
			.replace(/ -/g, '-') // hyphen preceded by space
			.replace(/\^/g, '')   // caret
			.replace(/modem/g, 'modern') // modem

			// join words that are hyphenated across a line break
			// (but leave "|-" table syntax alone)
			.replace(/([^\|])-\n/g, '$1');

		// clean up pages if they don't have <poem>
		if (!editor.contains('<poem>')) {
			editor
				// lines that start with " should probably be new lines,
				// if the previous line ends in punctuation,
				// other than a comma or semicolon
				// and let's get rid of trailing space while we're at it
				.replace(/([^\n\w,;])\n\" */g, '$1\n\n"')

				// lines that end with " should probably precede a new line,
				// unless preceded by a comma,
				// or unless the new line starts with a lower-case letter;
				// and let's get rid of preceding space while we're at it
				.replace(/([^,])\ *\"\n([^a-z\n])/g, '$1"\n\n$2')

				// remove single line breaks; preserve multiple.
				// but not if there's a tag, template or table syntax either side of the line break
				.replace(/([^>}\|\n])\n([^:#\*<{\|\n])/g, '$1 $2')

				// collapse sequences of spaces into a single space
				.replace(/  +/g, ' ');
		}

		// more page cleanup
		editor
			// dump spurious hard breaks at the end of paragraphs
			.replace(/<br *\/?>\n\n/g, '\n\n')

			// remove unwanted spaces around punctuation marks
			.replace(/ ([;:\?!,])/g, '$1')

			// unicodify
			.replace(/&mdash;/g, ' — ')
			// .replace(/&ndash;/g, '–')
			.replace(/&quot;/g, '"')

			// straighten quotes and apostrophes.
			.replace(/[“”]/g, '"')
			.replace(/[‘’`]/g, '\'')

			//OCR fixes
			// convert i9 to 19, etc.
			.replace(/[il]([0-9])/g, '1$1')

			// "the", "them", "their", etcetera
			.replace(/tlie/g, 'the')

			// "U" -> "ll" when preceded by a lowercase letter.
			.replace(/([a-z])U/g, '$1ll')

			// "would", "could"
			.replace(/woidd/g, 'would')
			.replace(/coidd/g, 'could')
			.replace(/shoidd/g, 'should')

			// many works have apostrophes missing from OCR
			.replace(/([a-z]) s\b/g, '$1\'s') // it's he's etc
			.replace(/n t\b/g, 'n\'t') //can't isn't didn't etc
			.replace(/([a-zI]) ll\b/g, '$1\'ll') // I'll we'll etc
			.replace(/\bI m\b/g, 'I\'m') // I'm
			.replace(/\b([Yy])ou re\b/g, '$1ou\'re') // you're
			.replace(/\b([Ww])e re\b/g, '$1e\'re') // we're
			.replace(/\b([Tt])hey re\b/g, '$1hey\'re') // they're
			.replace(/([a-zI]) ve\b/g, '$1\'ve') // I've we've etc

			// collapse runs of spaces inside a line into one space (the old
			// rule replaced two spaces with two spaces and so did nothing).
			// Spaces at the very start of a line are left alone so that
			// indentation inside <poem> survives.
			.replace(/([^\n ]) {2,}/g, '$1 ')

			// expand diacritical templates
			// (the replacement string is split so it is not itself substituted
			// when this script page is saved on the wiki)
			.replace(/{{((ae|oe|\w[:`'~^-]))}}/g, '{{subst'+':$1}}')

			// replace "float center" with "block center"; original template name was misleading enough to warrant routinely fixing
			.replace(/\{\{float center/g, '{{block center')

			// convert <center> tags to {{center}}. [\s\S] matches any character
			// including newlines; the old class [.\n] only matched literal
			// dots and newlines, so ordinary content was never converted.
			.replace(/<center>\s*([\s\S]*?)\s*<\/center>/g, '{{center|$1}}');

		/* move cursor to top of the page */
		var input = document.getElementById("wpTextbox1");
		if (input) {
			input.focus();
			input.setSelectionRange(0, 0);
			input.scrollTop = 0;
		}
	};
	
	/**
	 * As you work your way through the page, when you encounter a reference, just mark it with <ref></ref> tags and continue.
	 * Once you've got to the end of the page and proofed the references, simply highlight each reference in turn,
	 * and use this function to move it to its proper position.
	 *
	 * The selected text is cut from where it is and pasted inside the first
	 * empty <ref></ref> pair in the edit box. Nothing happens when there is no
	 * empty pair, when nothing is selected, or when the selection overlaps the
	 * empty pair itself.
	 * @param {object} editor The script helpers for the page (not used; the edit box is accessed directly).
	 */
	var makeReference = function(editor) {
		_initialise();

		var MARKER = '<ref></ref>';
		var OPEN_TAG_LENGTH = 5; // length of '<ref>'

		var editbox = $('#wpTextbox1').get(0);
		editbox.focus();
		var refStart = editbox.selectionStart;
		var refEnd = editbox.selectionEnd;
		var value = editbox.value;

		var markerStart = value.indexOf(MARKER);
		if (markerStart === -1 || refStart === refEnd)
			return;

		var markerEnd = markerStart + MARKER.length;
		var insertAt = markerStart + OPEN_TAG_LENGTH;
		var refText = value.slice(refStart, refEnd);

		if (refStart >= markerEnd) {
			// usual case: the reference text sits after the marker
			editbox.value = value.slice(0, insertAt)
			              + refText
			              + value.slice(insertAt, refStart)
			              + value.slice(refEnd);
		}
		else if (refEnd <= markerStart) {
			// reference text sits before the marker; the old slicing assumed
			// the opposite order and duplicated part of the page in this case
			editbox.value = value.slice(0, refStart)
			              + value.slice(refEnd, insertAt)
			              + refText
			              + value.slice(insertAt);
		}
		// a selection overlapping the marker cannot be moved sensibly: leave the text untouched
	};

	/**
	 * Insert formatted references into the footer box if needed.
	 *
	 * The first user-defined format whose title pattern matches the page title
	 * supplies the footer: its footerWithReferences when the page text contains
	 * references, its footer otherwise. When no format matches, the footer is
	 * set to a generic {{smallrefs}} block if the page has references, and
	 * cleared if it has none.
	 * @param {object} editor The script helpers for the page (not used; the edit boxes are accessed directly).
	 */
	var addPageFooter = function(editor) {
		_initialise();

		var editbox = $('#wpTextbox1').get(0);
		var footerbox = $('#wpFooterTextbox').get(0);

		// <ref>, <ref name=...>, <ref follow=...> and {{#tag:ref}} all count as references
		var hasRefs = /<ref[\s>]/.test(editbox.value) || editbox.value.indexOf("{{#tag:ref") !== -1;

		// format.title is a pattern (see the expected format in _initialise),
		// so it is matched the same way addPageHeader does. The previous code
		// called String.prototype.contains, which does not exist.
		var title = mw.config.get('wgTitle');
		for (var i = 0; i < state.specialFormats.length; i++) {
			var format = state.specialFormats[i];
			if (title.match(format.title)) {
				footerbox.value = hasRefs ? format.footerWithReferences : format.footer;
				return;
			}
		}

		// no special footer matched: use a generic ref block, or strip the
		// footer when there is nothing to list. (The previous code tested an
		// undefined "doGeneric" variable here and threw a ReferenceError.)
		footerbox.value = hasRefs ? '{{block center|{{smallrefs}}}}' : '';
	};

	/**
	 * Wrap the selected text in {{sc}}. Text that contains no lowercase letters
	 * is converted to title case first.
	 * @param {object} editor The script helpers for the page.
	 */
	var smallcaps = function(editor) {
		_initialise();

		// Small-caps on all-caps text only makes sense when the capitals are
		// OCR of text that was really set in small-caps, so such text is
		// title-cased before being wrapped.
		var toSmallCaps = function(selected) {
			var isAllCaps = (selected == selected.toUpperCase());
			var content = isAllCaps ? _titlecase(selected) : selected;
			return '{{sc|' + content + '}}';
		};

		editor.replaceSelection(toSmallCaps);
	};

	/**
	 * Replace the selected text with its uppercase form.
	 * @param {object} editor The script helpers for the page.
	 */
	var upper = function(editor) {
		_initialise();

		var toUpper = function(selected) {
			return selected.toUpperCase();
		};
		editor.replaceSelection(toUpper);
	};
});

// </nowiki>