Note: After saving, changes may not occur immediately. Click here to learn how to bypass your browser's cache.
- Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Cmd-R on a Mac)
- Google Chrome: Press Ctrl-Shift-R (Cmd-Shift-R on a Mac)
- Internet Explorer: Hold Ctrl while clicking Refresh, or press Ctrl-F5
- Opera: Clear the cache in Tools → Preferences
For details and instructions about other browsers, see Wikipedia:Bypass your cache.
Code that you insert on this page could contain malicious content capable of compromising your account. If you are unsure whether code you are adding to this page is safe, you can ask at the central discussion page, Scriptorium. The code will be executed when previewing this page under some skins, including Monobook. You can in the interim if you wish to refresh the content sooner under another skin. |
This script seems to have a documentation page at User:Inductiveload/cleanup. |
/*
* OCR cleanup script
*
* Mostly a bunch of regexes and prayer
*/
/* eslint-disable camelcase, no-restricted-syntax */
( function ( $, mw ) {
'use strict';
// Script version string and a short identifying signature.
// NOTE(review): neither is referenced in the visible part of the file.
const version = '0.1';
const signature = 'wsCleanup';
// Log severities. log() compares them numerically against Cleanup.logLevel,
// and anything at ERROR or above is routed to console.error.
const DEBUG = 0;
const INFO = 1;
const ERROR = 2;
// Script-wide configuration object.
// NOTE(review): presumably user scripts may override these values before the
// cleanup runs - confirm against the code that consumes this object.
const Cleanup = {
// minimum severity that log() will print
logLevel: ERROR,
enable: true,
testFunctions: [],
// true when the current page title (wgTitle) ends with 'cleanup-test'
enableTesting: mw.config.get( 'wgTitle' ).endsWith( 'cleanup-test' ),
portletCategory: 'page',
activeNamespaces: [ 'page' ],
actionTitle: 'WsCleanup',
additionalOcrReplacements: [],
// entries are [ regex, replacement? ]: PartialWordRegexProcessor skips a rule
// whose regex source equals the entry's, either always (no replacement given)
// or only when the replacement is also identical
disabledReplacements: [],
cleanupFunctions: [],
italicWords: [],
doLongSReplacements: false,
doTemplateCleanup: true,
remove_running_header: true,
replaceSmartQuotes: true,
collapseSuspiciousParagraphs: true,
shortLineThreshold: 45,
// languages the page may contain; pageMayHaveLangs() intersects this list
// with a rule's notLangs/onlyLangs option
possibleLanguages: [ 'en' ], // 'fr', 'es', 'de', 'zh-pinyin' ],
italiciseForeign: true,
smallAbbreviations: [],
// line patterns that look like a running header (page number plus capitals)
// NOTE(review): presumably handed to RunningHeaderProcessor - the call site
// is not visible here
runningHeaderPatterns: [
/^([ivxlcIVLXC.,]+|[iI0-9.,]+)\s+([A-Z[\]\s^*\-–—.,]*)\s*$/,
/^([A-Z\s[\]^*\-–—.,]*)\s+([ivxlcIVLXC.,]+|[iI0-9.,]+)\s*$/,
/^\s*(\d+|[A-Z[\] ]+)\s*$/
],
smallAbbrTemplate: 'smaller',
editSummary: '/* Proofread */',
markProofread: true,
cleanupAccesskey: 'c'
};
/**
 * Print a message to the console when it is severe enough.
 *
 * Messages whose level is not at least Cleanup.logLevel are dropped. Messages
 * at ERROR level or above go to console.error, all others to console.log.
 *
 * @param {number} level severity of the message (DEBUG, INFO or ERROR)
 * @param {*} s message printed after the 'Cleanup: ' prefix
 */
function log( level, s ) {
	if ( !( level >= Cleanup.logLevel ) ) {
		return;
	}
	// eslint-disable-next-line no-console
	const sink = level >= ERROR ? console.error : console.log;
	sink( 'Cleanup: ', s );
}
/**
 * Abstract base class for a single cleanup step.
 *
 * Subclasses override both methods; the versions here only throw so that an
 * incomplete processor fails loudly instead of silently doing nothing.
 */
class CleanupProcessor {
	/**
	 * Transform the page text.
	 *
	 * @throws {Error} always, in the base class
	 */
	process( /* text */ ) {
		throw new Error( 'Processors must implement process()' );
	}

	/**
	 * Human-readable name of the processor (used in log output).
	 *
	 * @throws {Error} always, in the base class
	 */
	name() {
		throw new Error( 'Processors must implement name()' );
	}
}
/**
 * Run one processor over the editor contents and write the result back.
 *
 * @param {Object} editor object exposing get() (returns the text) and set( text )
 * @param {CleanupProcessor} processor step to apply; its name() is logged at INFO
 */
function process_editor( editor, processor ) {
	const before = editor.get();
	log( INFO, `Processing editor with ${processor.name()}` );
	const after = processor.process( before );
	editor.set( after );
}
/**
 * Replace whole words only.
 *
 * Each rule is a [ bad, good ] pair where `bad` is regex source text; it is
 * wrapped in \b...\b and applied globally, in list order.
 */
class WholeWordRegexProcessor extends CleanupProcessor {
	/**
	 * @param {Array} reps list of [ bad, good ] pairs
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	name() {
		return 'Generic whole word regexes';
	}

	/**
	 * @param {string} text text to clean
	 * @return {string} text with every rule applied
	 */
	process( text ) {
		log( DEBUG, `Making ${this.reps.length} replacements` );
		let cleaned = text;
		for ( const pair of this.reps ) {
			const wholeWord = new RegExp( `\\b${pair[ 0 ]}\\b`, 'g' );
			cleaned = cleaned.replace( wholeWord, pair[ 1 ] );
		}
		return cleaned;
	}
}
/**
 * Check whether any of the given languages is one the page may contain.
 *
 * Despite the parameter name, callers pass both "denied" (notLangs) and
 * "allowed" (onlyLangs) lists; this only reports whether the list overlaps
 * Cleanup.possibleLanguages.
 *
 * @param {string[]} deniedLangs language codes to look for
 * @return {boolean} true when at least one code is in Cleanup.possibleLanguages
 */
function pageMayHaveLangs( deniedLangs ) {
	return Cleanup.possibleLanguages.some(
		( lang ) => deniedLangs.includes( lang )
	);
}
/**
 * Apply a list of regex rules anywhere in the text (not only on whole words).
 *
 * A rule is [ regex, replacement, options? ]. Every rule is applied globally
 * and in list order, unless it is skipped because:
 *  - Cleanup.disabledReplacements names it, or
 *  - options.notLangs overlaps the page's possible languages, or
 *  - options.onlyLangs does not overlap them.
 */
class PartialWordRegexProcessor extends CleanupProcessor {
	/**
	 * @param {Array} reps list of [ regex, replacement, options? ] rules
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	name() {
		return 'Generic partial word regexes';
	}

	/**
	 * @param {string} text text to clean
	 * @return {string} text with every non-skipped rule applied
	 * @throws {Error} rethrows whatever building or applying a rule throws,
	 *  after logging which rule failed
	 */
	process( text ) {
		log( DEBUG, `Making ${this.reps.length} replacements` );

		let cleaned = text;

		this.reps.forEach( ( rule, index ) => {
			const options = rule[ 2 ];

			// A disabled entry without a replacement disables every rule with
			// that regex; with a replacement it must match that as well.
			const disabledBy = Cleanup.disabledReplacements.filter(
				( dis ) => dis[ 0 ].source === rule[ 0 ].source &&
					( !dis[ 1 ] || dis[ 1 ] === rule[ 1 ] )
			);
			if ( disabledBy.length > 0 ) {
				log( DEBUG, `Skipped disabled replacement: ${rule[ 0 ].source} -> ${rule[ 1 ]}` );
				return;
			}

			if ( options && options.notLangs && pageMayHaveLangs( options.notLangs ) ) {
				log( DEBUG, `Skipped replacement with denied language: ${rule[ 0 ].source} (due to ${options.notLangs})` );
				return;
			}

			if ( options && options.onlyLangs && !pageMayHaveLangs( options.onlyLangs ) ) {
				log( DEBUG, `Skipped replacement as no allowed language: ${rule[ 0 ].source} (due to ${options.onlyLangs})` );
				return;
			}

			try {
				// Always replace globally, keeping whatever other flags the rule has.
				// Note that \b in a rule is ASCII-only and will not see accented
				// letters as word characters.
				const flags = 'g' + rule[ 0 ].flags.replace( 'g', '' );
				const pattern = new RegExp( rule[ 0 ].source, flags );
				cleaned = cleaned.replace( pattern, rule[ 1 ] );
			} catch ( error ) {
				log( ERROR, `Error in ${index}th replacement: ${rule}` );
				throw error;
			}
		} );

		return cleaned;
	}
}
/**
 * Split off things that can never be the end of a word.
 *
 * When one of the given patterns is found glued to the end of some word
 * characters, a space went missing _before_ the match, so one is put back.
 */
class BannedSuffixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} suffix_list patterns that must start a new word
	 */
	constructor( suffix_list ) {
		super();
		this.suffix_list = suffix_list;
	}

	name() {
		return 'Banned suffixes';
	}

	/**
	 * @param {string} text text to clean
	 * @return {string} text with a space inserted before each banned suffix
	 */
	process( text ) {
		let cleaned = text;
		for ( const suffix of this.suffix_list ) {
			const flags = `g${suffix.flags.replace( 'g', '' )}`;
			const glued = new RegExp( `(\\w+)(${suffix.source})`, flags );
			cleaned = cleaned.replace( glued, '$1 $2' );
		}
		return cleaned;
	}
}
/**
 * Split off things that can never be the start of a longer word.
 *
 * When one of the given patterns is directly followed by more word
 * characters, a space went missing _after_ the match, so one is put back.
 */
class BannedPrefixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} prefix_list patterns that must be a word of their own
	 */
	constructor( prefix_list ) {
		super();
		this.prefix_list = prefix_list;
	}

	name() {
		return 'Banned prefixes';
	}

	/**
	 * @param {string} text text to clean
	 * @return {string} text with a space inserted after each banned prefix
	 */
	process( text ) {
		let cleaned = text;
		for ( const prefix of this.prefix_list ) {
			const flags = `g${prefix.flags.replace( 'g', '' )}`;
			const glued = new RegExp( `(${prefix.source})(\\w+)`, flags );
			cleaned = cleaned.replace( glued, '$1 $2' );
		}
		return cleaned;
	}
}
/**
 * Re-attach suffixes that have been split from their word.
 *
 * The given patterns are things that cannot stand alone and are most likely
 * the end of the previous word (i.e. a space or hyphen has been inserted
 * _before_ the match), so the separator in front of them is removed.
 */
class OrphanSuffixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} reps patterns that only make sense as a word ending
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	/**
	 * @param {string} text text to clean
	 * @return {string} text with the separator before each orphan suffix removed
	 */
	process( text ) {
		for ( const v of this.reps ) {
			const newflags = 'g' + v.flags.replace( 'g', '' );
			// Wrap the source in a non-capturing group so the trailing \b applies
			// to every alternative of a pattern such as /ing|ed/, not just the last.
			const orphan = new RegExp( '[\\s\\-]((?:' + v.source + ')\\b)', newflags );
			text = text.replace( orphan, '$1' );
		}
		return text;
	}

	name() {
		return 'Orphan suffixes';
	}
}
/**
 * Re-attach prefixes that have been split from their word.
 *
 * The given patterns are things that cannot stand alone and are most likely
 * the start of the following word (i.e. a space or hyphen has been inserted
 * _after_ the match), so the separator behind them is removed. Matching is
 * always global and case-insensitive.
 */
class OrphanPrefixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} reps patterns that only make sense as a word beginning
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	/**
	 * @param {string} text text to clean
	 * @return {string} text with the separator after each orphan prefix removed
	 */
	process( text ) {
		for ( const v of this.reps ) {
			// Strip *every* existing g/i before forcing them on: removing only the
			// first one turned the flags of a /.../gi rule into the invalid 'gii'.
			const newflags = 'gi' + v.flags.replace( /[gi]/g, '' );
			// Wrap the source in a non-capturing group so the leading \b applies
			// to every alternative of a pattern such as /un|re/, not just the first.
			const orphan = new RegExp( '(\\b(?:' + v.source + '))[\\s\\-]', newflags );
			text = text.replace( orphan, '$1' );
		}
		return text;
	}

	name() {
		return 'Orphan prefixes';
	}
}
/**
 * Wrap every match of the given patterns in wiki italics (''...'').
 *
 * A match that directly follows '' is left alone so text that is already
 * italic is not wrapped a second time.
 */
class ItaliciseProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} reps patterns whose matches should be italicised
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	name() {
		return 'Italics';
	}

	/**
	 * @param {string} text text to clean
	 * @return {string} text with matches wrapped in ''
	 */
	process( text ) {
		let cleaned = text;
		for ( const rule of this.reps ) {
			// NOTE(review): only the first g-or-i flag is removed here, so an 'i'
			// flag is dropped unless the rule also carries 'g' - confirm whether
			// case-insensitive italic rules are meant to be supported.
			const flags = `g${rule.flags.replace( /[gi]/, '' )}`;
			const notYetItalic = new RegExp( `(?<!'')(${rule.source})`, flags );
			cleaned = cleaned.replace( notYetItalic, "''$1''" );
		}
		return cleaned;
	}
}
/*
* These functions need the original line breaks
*/
/**
 * Cleanup that has to run while the OCR line breaks are still in place:
 * trims line-end spaces and re-joins words hyphenated across a line break.
 *
 * @param {Object} editor editor wrapper passed on to process_editor()
 */
const do_pre_collapse_cleanup = function ( editor ) {
	const lineBreakRules = [];

	// remove trailing spaces at the end of each line
	lineBreakRules.push( [ / +\n/, '\n' ] );

	// treat these symbols as hyphens
	lineBreakRules.push( [ /[⌐¬]/, '-' ] );

	// Join words that are hyphenated across a line break, leaving "|-" table
	// syntax alone. Order matters: the capital-letter rule must run first.
	// Capitals keep their hyphen e.g. non-European
	lineBreakRules.push( [ /([^|])-\n(?=[ÁÀA-ZÉÈÖ])/, '$1-' ] );
	// everything else loses the hyphen
	lineBreakRules.push( [ /([^|])-\n(?=[\w])/, '$1' ] );

	const processor = new PartialWordRegexProcessor( lineBreakRules );
	process_editor( editor, processor );
};
/**
 * Remove a running header from the top of the page text.
 *
 * Starting at the first line, blank lines and lines matching any of the
 * configured patterns are dropped; the first line that is neither ends the
 * header and everything from there on is kept.
 */
class RunningHeaderProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} rh_patterns patterns that identify a header line
	 */
	constructor( rh_patterns ) {
		super();
		this.rh_patterns = rh_patterns;
	}

	name() {
		return 'Trim running header patterns';
	}

	/**
	 * @param {string} text page text (LF or CRLF line endings)
	 * @return {string} text without the leading header lines, joined with LF
	 */
	process( text ) {
		const lines = text.split( /\r?\n/ );
		let new_start_line = 0;

		for ( const line of lines ) {
			if ( line.trim().length === 0 ) {
				new_start_line += 1;
				continue;
			}

			let found = false;
			for ( const pattern of this.rh_patterns ) {
				// test() resumes from lastIndex on global/sticky patterns, which
				// would make a user-supplied /.../g pattern miss the next line
				pattern.lastIndex = 0;
				if ( pattern.test( line ) ) {
					found = true;
					break;
				}
			}

			if ( !found ) {
				// first real content line: the header ends here
				break;
			}
			new_start_line += 1;
		}

		return lines.slice( new_start_line ).join( '\n' );
	}
}
/**
 * General typographic cleanup: scanner watermarks, stray whitespace,
 * spacing around punctuation, dashes, and a few common abbreviations.
 *
 * Rules are [ regex, replacement ] pairs handed to a
 * PartialWordRegexProcessor, which applies them globally and in order.
 *
 * @param {Object} editor editor wrapper passed on to process_editor()
 */
const do_generic_cleanup = function ( editor ) {
	// various cleanup
	const reps = [
		// Digitized by Google (kill)
		[ /\s?D[ijl]g[ijl]t[ijl][sz][eco]d\s+by[^\n]*\s+([6G][Oo0Q]{2}g[lIf][eco])?/, '' ],
		[ /\bG[oO0]{2}gle\b/, '' ],
		// Remove highly suspicious chars
		[ /[■•]/, '' ],
		// remove trailing whitespace preceding a hard line break
		[ / +<br *\/?>/, '<br />' ],
		// remove trailing whitespace at the end of page text
		[ /\s+$/, '' ],
		// remove trailing spaces at the end of refs
		[ / +<\/ref>/, '</ref>' ],
		// remove trailing spaces at the end of template calls
		[ / +}}/, '}}' ],
		// lines containing only punctuation are likely junk
		[ /^[.,^]$/m, '' ],
		// convert double-hyphen to mdash (avoiding breaking HTML comment syntax)
		[ /([^!])--([^>])/, '$1—$2' ],
		// Remove spaces around hyphens between words
		// Eg. pack -house -> pack-house
		[ /(\w) ?- ?(\w)/, '$1-$2' ],
		// remove unwanted spaces before punctuation marks
		[ / ([);:?!,.])/, '$1' ],
		// ensure spaces after punctuation marks
		[ /([);:?!,.])([^ 0-9\n}|"'’”])/, '$1 $2' ],
		// ...but double punctuation doesn't get any spaces
		[ /([);:?!,.]) +([\n);:?!,.\]]|$)/, '$1$2' ],
		// Double full-stop is probably just (3 or 4 is OK - ellipsis)
		[ /(\w)\.\. (?=\w)/, '$1. ' ],
		// no spaces for inter-numeric punctuation
		[ /([0-9][,]) +([0-9]{3}(?![0-9]))/, '$1$2' ],
		// quotes at start of line can't be a close
		[ /^(['"]) (?=[A-Za-z])/m, '$1' ],
		// quotes at end of line can't be an open
		[ / (['"])$/m, '$1' ],
		// no space in "'s"
		[ / ?' ?s([\n ])/, '\'s$1' ],
		[ /\( +/, '(' ],
		[ / +\)/, ')' ],
		[ / *— */, '—' ],
		// Date ranges
		[ /([0-9]{3,4})-([0-9]{2,4})/, '$1–$2' ],
		// figures
		[ / ?, ?ooo/, ',000' ],
		// q.v. to q. v.
		[ /q\.v\./, 'q. v.' ],
		// i.e.
		[ /\bi\.? ?e\.(?!')/, "''i.e.''" ],
		// & c. to &c.
		[ / ?& ?[coe][.,]([,]?)/, ' &c.$1' ],
		// this is an old pound notation
		// with a slash after a space
		[ /([0-9]) ?[/]\.(?=\s)/, "$1''l.''" ],
		// No spaces between num and st/nd/rd
		[ /([0-9]) (st|nd|rd)\b/, '$1$2' ],
		[ /ty(one|two|three|four|five|six|seven|eight|nine|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth)/, 'ty-$1' ],
		// U+FB01 "fi" ligature to plain "fi". The pattern is written as an escape
		// so the ligature cannot be silently normalised into the two plain
		// letters, which would turn this rule into a no-op.
		[ /\uFB01/, 'fi' ],
		[ /ſ/, 'f' ],
		[ /_/, ' ' ]
	];
	process_editor( editor, new PartialWordRegexProcessor( reps ) );
};
/**
 * Fix common OCR mis-readings inside words.
 *
 * Rules are [ regex, replacement, options? ] entries, grouped by the
 * character confusion they repair ("x -> y" means OCR produced x where the
 * page has y). They are handed to a PartialWordRegexProcessor, which applies
 * each rule globally and in list order, so later rules see the output of
 * earlier ones. The optional third element may carry `notLangs`/`onlyLangs`
 * language restrictions.
 *
 * Conventions for rules in this list:
 *  - every `$n` in a replacement must refer to a capture group of its regex;
 *  - to require a letter at the start of a word before the match, put the
 *    word boundary inside the lookbehind, `(?<=\b[Tt])x`. The form
 *    `\b(?<=[Tt])x` can never match because there is no boundary between
 *    two word characters.
 *
 * @param {Object} editor editor wrapper passed on to process_editor()
 */
const do_ocr_fixes = function ( editor ) {
	const reps = [
		// some apostrophes probably bogus at word start
		[ /\b([vw])'([a-z])/, '$1$2' ],
		// some mis-read full-stops
		[ /\b(?<=Mr|Mrs|Mssrs|Ms)'/, '.' ],
		// ^ -> '' : delete spurious carets
		[ /(?<=w)\^/, '' ],
		// ! -> l
		[ /ua!(?=\s)/, 'ual' ],
		// / -> f
		[ /\/ellow/, 'fellow' ],
		// / -> t
		[ /(\s)\/he\b/, '$1the' ],
		// £ -> f
		[ /£f\b/, 'ff' ],
		// « -> s
		[ /(?<=\w)«(?=\s)/, 's' ],
		// $ -> s
		[ /(?<=[a-z])\$/, 's' ],
		// }' -> y
		[ /r}'/, 'ry' ],
		// ' -> y
		[ /(?<=\b[Vv]er)'/, 'ery' ],
		[ />(?=['"])/, '?' ],
		// } -> ?
		[ /(?<=[a-z]) }/, '?' ],
		[ /\('(?=yc)/, 'C' ],
		// 'I' -> T
		[ /(?<=\W)'[IJ]'(?=\w)/, 'T' ],
		// 0 -> O
		[ /\b0[*']([BNR])/, "O'$1" ], // Irish names
		// 1 -> i
		[ /(?<=\. )1(?=n|s|t)/, 'I' ],
		[ /1(?=n|s|t)/, 'i' ], // hard to tell In or in
		// avoid units, dates, and "1 of", "1 to" and "1 in"
		[ / 1 (?![0-9A-Z]|(or|to|in|of)\b|inch|mi\b|mile|ft|foot|cm|cent(i|\b)|dollar|pound|yard|metr|mm|km|kilo|acre|hect[ao])/, ' I ' ],
		// 4 -> d
		[ /4oor/, 'door' ],
		[ /e4\b/, 'ed' ],
		// 6 -> o
		[ /\b6(?=[a-z])/, 'o' ], // 6n, 6f, etc
		// 8 -> S
		[ /\b8(?=\w|\b)/, 'S' ], // 8o, etc, but not 8o00
		// 8i -> th
		[ /\b8i/, 'th' ],
		// a -> e
		[ /(?<=[Jj]udg)a/, 'e' ],
		// a -> f
		[ /\baf\b/, 'of' ],
		// a -> n
		[ /\baad/, 'and' ],
		[ /upoa/, 'upon' ],
		[ /\bia\b/, 'in' ],
		[ /(?<=[Rr])emaia/, 'emain' ],
		// a -> s
		[ /riaon/, 'rison' ],
		[ /wera\b/, 'wers' ],
		[ /\beap/, 'esp' ],
		// AA -> W
		// (word-initial capitals: the mis-read pair is replaced, not kept)
		[ /\b(AA|AV)(?=[a-z]{2})/, 'W' ],
		[ /\bnat\b/, 'not' ],
		// ae -> nc
		[ /aaee(|s|d)\b/, 'ance$1' ],
		// Av -> w
		[ /Av(ill|ith|ere\b|est|here|hat|as\b|ould|ho|or|hich|hen|ell|eigh|ise|eak|rit|ron)/,
			'w$1' ],
		[ /AV(ill|ith|ere\b|est|here|hat|as\b|ould|ho|or|hich|hen|ell|eigh|ise|eak|rit|ron)/,
			'W$1' ],
		[ /(?<=[a-z])AV\b/, 'w' ],
		// Avli -> wh
		[ /\bAvli(ich|om?(ever)?|en|ere|ether|y)\b/, 'wh$1' ],
		[ /\bAVli(ich|om?(ever)?|en|ere|ether|y)\b/, 'Wh$1' ],
		// b -> e
		[ /\b([Tt])hb/, '$1he' ],
		// b -> h
		[ /\bbow(so|ever|itz|beit)/, 'how$1' ], // watch for bowl...
		[ /(?<=\b[Tt])be(?=y\b|a\b|se\b|ir\b)/, 'he' ],
		[ /(?<=\b[Ww])b(?=i|e)/, 'h' ], // which, when
		[ /\bbas(|n't|ten)\b/, 'has$1' ],
		[ /\bber(|self|eto)\b/, 'her$1' ],
		[ /\bbim(|self)\b/, 'him$1' ],
		[ /([Ww])hicb/, '$1hich' ],
		[ /\b([Ss])bow/, '$1how' ],
		// b -> o
		[ /(?<=\b[Ss])b/, 'o' ],
		// b -> r
		[ /mbeb\b/, 'mber' ],
		[ /dmibal/, 'dmiral' ],
		[ /xtba/, 'xtra' ],
		[ /Victobia/, 'Victoria' ],
		// B -> E
		[ /(?<=\bTH|\bTHR)B/, 'E' ],
		// B -> R
		[ /Bailw/, 'Railw' ],
		[ /Boyal/, 'Royal' ],
		[ /\bFBO/, 'FRO' ],
		// c -> e
		[ /cx(?![ivxcdm]+\b)/, 'ex' ], // mind roman numerals
		[ /becn/, 'been' ],
		[ /\bbcen/, 'been' ],
		[ /(C|c)lcar/, '$1lear' ],
		[ /(a|u|o|p)pces\b/, '$1pees' ], // rupees,...
		[ /(C|c)asc(\b|(?=\w)[^a])/, '$1ase$2' ],
		[ /\bwc\b/, 'we' ],
		[ /(?<=[Ss]t|\b[Tt])cam/, 'eam' ],
		[ /(S|s)evc/, '$1eve' ], // several/severe
		[ /([Gg])rcat/, '$1reat' ],
		[ /([fvh])crence/, '$1erence' ],
		[ /(?<=\b[Hh])c\b/, 'e' ], // hc -> he
		[ /\bcn(?!i)/, 'en' ],
		[ /\bmcn\b/, 'men' ],
		[ /((?=\w)[^ao]|\b)rcs/, '$1res' ], // avoid arcs/orcs
		[ /\Borcs\b/, 'ores' ], // but it can be a suffix of ores
		[ /\bpcople(|s)\b/, 'people$1' ],
		[ /\b&e\.(?=\s|$)/, '&c.' ],
		[ /catc(|d)\b/, 'cate$1' ],
		[ /\bcight/, 'eight' ],
		[ /nccessar/, 'necessar' ],
		[ /\b([Ww])cr/, '$1er' ],
		[ /([^Aaeou])rcat/, '$1reat' ],
		[ /\b([Oo])nc(|s)\b/, '$1ne$2' ],
		[ /(?<=\b[Ss])[ec][ec](?=m|ing)/, 'ee' ], // seem, seeing
		[ /(?<=g)mics\b/, 'mies' ],
		[ /(?<=\b[Ss])tr[ce][ce]ct/, 'treet' ], // street
		[ /ocict/, 'ociet' ], // society
		[ /cither/, 'either' ], // cither exists, but...
		[ /(?<=\b[Ss])[ce][ce](?=d|\b|ing|m)/, 'ee' ], // see, seed, seeing
		[ /(?<=\b[Ss])c(?=er|ct)/, 'e' ], // seer... (not secretary)
		[ /(?<![ln])icf/, 'ief' ], // grief
		[ /c(?=ver|lectr)/, 'e' ], // ever, every, electric
		[ /(?<=[Pp])copl[ce]/, 'eople' ], // people
		[ /(?<=[Gg]rac|[Rr]os)c/, 'e' ], // grace, rose
		[ /(?<=[Cc]ru|[Yy]i)cl/, 'el' ], // cruel, yield, etc
		[ /cl(?=\b|l|f)/, 'el' ], // inc. scfl -> self
		[ /ncral(?=(?:s|ly|ity|ities)\b)/, 'neral' ], // general-
		[ /cth(?!ood|eroy|roat|yma|esis|etic|lip|idro|i\b)/, 'eth' ], // maketh, etc
		[ /tcd\b/, 'ted' ],
		[ /\b(t|tsz|sz)c\b/, '$1e' ], // chinese
		// ce -> œ
		[ /(?<=[Mm]an)ce(?=u)/, 'œ' ],
		// ci -> d
		[ /(P|p)rociu/, '$1rodu' ],
		[ /\bacidition(s|)\b/, 'addition$1' ],
		// ci -> ici
		[ /offci/, 'offici' ],
		// cnce: ence
		[ /cnce\b/, 'ence' ],
		[ /clves\b/, 'elves' ],
		// bom to born
		[ /\bbom\b/, 'born' ],
		// c -> d
		[ /aciva/, 'adva' ], // advantag...
		// c -> g
		[ /(\B[^\bzlp])inc\b/, '$1ing' ],
		// c -> o
		[ /\bcwn/, 'own' ],
		[ /cc(?=ln|ld|mp|lum|n|resp|s)/, 'co' ], // Lincoln, cold, company, ...
		[ /\bcc(?=urt)/, 'co' ], // court, not accurtation
		[ /\bcught/, 'ought' ],
		// c -> s
		[ /\b([dD])icre/, '$1isre' ], // disregard
		// ci -> d
		[ /eci\b/, 'ed' ],
		// d -> i
		[ /\bwdth/, 'with' ],
		// d -> o
		[ /d(?=mp|wn)/, 'o' ], // eg. compose, town
		[ /fdr/, 'for' ],
		// dl -> 31
		[ /\b[Sd3]lst\b/, '31st' ],
		// e -> a
		[ /\bscele(|s|d)\b/, 'scale$1' ],
		// e -> c
		[ /\be(ome)\b/, 'c$1' ],
		[ /rcet/, 'rect' ], // direct...
		[ /struet/, 'struct' ],
		[ /enee\b/, 'ence' ],
		[ /expeet/, 'expect' ],
		[ /((?=\B)[^n]|[oi]n)speet/, '$1spect' ], // avoid speet and Nunspeet
		[ /taeh/, 'tach' ], // detach
		[ /\bwhieh(|ever)\b/, 'which$1' ],
		[ /\bfec\b/, 'fee' ],
		[ /execpt/, 'except' ],
		[ /([^q])uet(ing|ed)\b/, '$1uct$2' ], // conducted, conducting
		[ /&e\./, '&c.' ],
		[ /(?<=[Uu]n)ele(?=s?\b)/, 'cle' ],
		// é -> è
		[ /ére\b/, 'ère' ], // No words end with acute-e ére
		// E -> F
		[ /E(rom )/, 'F$1' ],
		// e -> o
		[ /\bef\b/, 'of' ],
		[ /\bfrem\b/, 'from' ],
		[ /\bse\b/, 'so', { notLangs: [ 'es', 'fr', 'zh-pinyin' ] } ],
		// e -> r
		[ /rthee(?!ls)/, 'rther' ], // further, northern
		[ /outhee(?!ls|l\b)/, 'outher' ], // southern/ly
		[ /([^r])eoad/, '$1road' ], // broad
		// e -> s
		[ /\beo(|uth)\b/, 'so$1' ], // so, south
		[ /\bthoee\b/, 'those' ],
		// el -> d
		[ /\belyn/, 'dyn' ],
		[ /itel\b/, 'ited' ], // cited, united,...
		// -eney -> -ency (sad for Sweeny Todd)
		[ /eney\b/, 'ency' ],
		// er -> ev
		[ /\berery/, 'every' ],
		// é -> c
		[ /([aeiou])é(t)/, '$1c$2' ],
		// f -> nothing
		[ /\bhighfer/, 'higher' ],
		// f -> i
		[ /anfes\b/, 'anies' ],
		[ /stfan/, 'stian' ],
		// f -> l
		[ /(?<=[Aa])farm/, 'larm' ],
		// f -> t
		[ /\b(|in)difterent/, '$1different' ],
		[ /\bfwo/, 'two' ],
		// f -> r
		[ /(?<=\bB)[ft]it(?=ish|ain)/, 'rit' ],
		// ff -> fl
		[ /\bff(ood)\b/, 'fl$1' ],
		// ff -> ñ
		[ /(?<=[Ss])paf[ifl]a\b/, 'paña' ],
		// g -> ç
		[ /(?<=Mendon?)ga\b/, 'ça' ],
		[ /(?<=Gu?on?)g(?=all?o)\b/, 'ç' ],
		[ /Lorengo/, 'Lorenço' ],
		// G -> 6
		[ /\bG([0-9]*)th\b/, '6$1th' ],
		// h -> b
		[ /([Dd])ouht/, '$1oubt' ],
		[ /\bhe(en)\b/, 'be$1' ],
		[ /([Oo])hser/, '$1bser' ], // observe
		[ /\bhio/, 'bio' ],
		[ /\bemh/, 'emb' ],
		[ /\bheyo/, 'beyo' ],
		[ /\bohs\B/, 'obs' ],
		[ /\bhy\b/, 'by' ],
		[ /\bhe(?=ings?|en\b|an\b)/, 'be' ],
		[ /\bhene(?!icos|n|q)/, 'bene' ],
		// h -> c
		[ /\bhareful(|ly)/, 'careful$1' ],
		// h -> im
		[ /\bh(?=nony|nonies)\b/, 'im' ],
		// h/U -> li
		[ /\b(h|U)(fe|ke|ttle)\b/, 'li$2' ],
		[ /nghs([ht])/, 'nglis$1' ], // English, etc
		// h -> n
		[ /\bih(?![ilr])/, 'in' ],
		[ /lahd(?='?s?\b|ing'?s?\b)/, 'land' ],
		// h -> li
		[ /\bhv[ec](?=s|)\b/, 'live' ],
		[ /(?<=\b[Aa])hve\b/, 'live' ], // alive
		[ /hng(?=s|ly)?\b/, 'ling' ],
		[ /dehc/, 'delic' ], // delicate, etc
		// h -> lt
		[ /cuh(?=(|y)\b)/, 'cult' ], // difficult(y), etc
		// H -> li
		[ /\bHke/, 'like' ],
		// H -> ll
		[ /(?<=\bA|[a-z])H/, 'll' ],
		// hv -> lw
		[ /(?<=[Aa]|ai|l)hvay/, 'lway' ], // always, railway, spillway
		// convert i9 to 19, etc.
		[ /[il]([0-9])/, '1$1' ],
		// i -> 1
		[ /\b[Il][Iil]th\b/, '11th' ],
		[ /(?<=[0-9])ist\b/, '1st' ],
		// I -> 1
		[ /\bIst\b/, '1st', { notLangs: [ 'de' ] } ],
		// i -> nothing
		[ /\bsomie/, 'some' ],
		[ /sielf/, 'self' ],
		[ /\b([Tt])hi(ey|ese)\b/, '$1h$2' ],
		[ /senise/, 'sense' ],
		[ /(?<=[Ff])irom/, 'rom' ],
		// I -> nothing
		// See also T -> nothing
		// i -> a
		[ /\bnime(ed|ly)/, 'namely' ],
		// i -> f
		[ /\bior(\b|m)/, 'for$1' ],
		[ /(I|i)nior/, '$1nfor' ],
		[ /([^m])afi(a|o)/, '$1aff$2' ],
		[ /\ba[ií]f/, 'aff' ],
		[ /([rhlf])iei(s|ly|)\b/, '$1ief$2' ], // brief
		// i -> j
		[ /(in|b|con|de|a)iect/, '$1ject' ],
		[ /\biett(y|ies)/, 'jett$1' ],
		// i -> l
		[ /([a-z])abie\b/, '$1able' ],
		[ /ficuit(|y)/, 'ficult$1' ],
		[ /enerai/, 'eneral' ],
		[ /\biab(o|ou)r/, 'lab$1r' ],
		[ /cicar/, 'clear' ],
		[ /shali(\b|ow)/, 'shall$1' ],
		[ /(i)abie\b/, '$1able' ], // reliable, ...
		[ /reiig/, 'relig' ],
		[ /([aeiou])riy\b/, '$1rly' ],
		[ /\b(un|)iaw/, '$1law' ],
		[ /\bgloi(y|ious)/, 'glor$1' ],
		[ /tiy\b/, 'tly' ],
		[ /iais\b/, 'ials' ], // materials...
		[ /\b([Ii])li(s?\b|ness)/, '$1ll$2' ],
		[ /(?<=[Ss]e)if/, 'lf' ], // self
		// -isli -> -ish
		[ /(\w)isli\b/, '$1ish' ],
		// i -> r
		[ /eiy(?![ua])/, 'ery' ],
		[ /([Ff])iist/, '$1irst' ],
		[ /([Gg])ieat/, '$1reat' ],
		[ /\b([Pp])oit(?![ior])/, '$1ort' ], // port/ion
		[ /beied\b/, 'bered' ],
		// i -> t
		[ /(a|o|i)iion/, '$1tion' ],
		[ /leci\b/, 'lect' ],
		[ /aier/, 'ater' ], // material
		[ /\bmulii/, 'multi' ],
		[ /\bihe/, 'the' ], // the, there...
		[ /nir(ies|y)/, 'ntr$1' ], // country
		[ /\bio(|wards?|gether)\b/, 'to$1' ],
		[ /\bihat\b/, 'that' ],
		[ /enily\b/, 'ently' ],
		[ /ciion/, 'ction' ],
		[ /(?<=[Bb]u)i/, 't' ],
		[ /Stewari/, 'Stewart' ],
		// i' in a word -> r (not 's)
		[ /(?<=[a-z])i'(?=[a-rt-z]|s\w)/, 'r' ],
		// i^ > r
		[ /(?<=[a-z])i\^/, 'r' ],
		// i- -> r (be more careful than ^, - can be right)
		[ /(?<=Yo)i-/, 'r' ],
		// I -> f
		[ /\bIor([^gim]|\b)/, 'for$1' ],
		// I -> l
		[ /\b[l1I]' ?(?=[AEIOUÉÈaeiouéè]\w)/, 'l\'' ],
		// I' at word start -> f (except I'd. I'm, I'll, etc)
		[ /\bI'([a-ce-kn-uw-z])/, 'f$1' ],
		// I- -> L
		[ /\bI-ord/, 'Lord' ],
		// I^ -> P
		[ /\bI\^/, 'P' ],
		// id -> nl
		[ /\boidy/, 'only' ],
		// id -> ul
		[ /\bshoidd/, 'should' ],
		// if -> i
		[ /(?<=\b[Oo])if\b/, 'f' ],
		// If -> N (happens in cap'd words)
		[ /\b([A-Z]+)If\b/, '$1N' ],
		// ii -> a
		[ /\biind\b/, 'and' ],
		[ /\biimount/, 'amount' ],
		// II -> H
		[ /\bII(e|[a-z]{2,})\b/, 'H$1' ],
		// ii -> h
		[ /tiie/, 'the' ],
		[ /hicii/, 'hich' ], // which
		// II -> M
		[ /II(?=r|s)/, 'M' ],
		// ii -> n
		[ /aiis(?!m)/, 'ans' ],
		[ /co(?:ii|tt)c/, 'conc' ],
		// ii -> u
		[ /(?<=\b[SsBbMm])ii/, 'u' ],
		[ /\bii(?!\b|i)/, 'u' ], // avoid roman nums iii
		[ /iiim(?=s?\b)/, 'ium' ],
		[ /(?<=[Yy])oii/, 'ou' ],
		// ii -> ü
		[ /(?<=\bHs?)iian\b/, 'üan' ],
		[ /\bMiiller/, 'Müller' ],
		[ /\bYii(?=n\b|an\b)/, 'Yü' ],
		[ /\bTriib/, 'Trüb' ],
		// -iiig -> -ing
		[ /iiig\b/, 'ing' ],
		// ij -> h
		[ /tija(?!j)/, 'tha' ],
		[ /([Tt])ij([ae])/, '$1h$2' ],
		// il -> H
		[ /(\W |\n)il(e|im|er)/, '$1 H$2' ],
		// Il -> H
		[ /\bIlo(?![ck]no|ilo|ko|na|ne\b|ngot|nka|rin|ts?\b|tycin|well)/, 'Ho' ],
		// in -> m
		[ /soine/, 'some' ],
		[ /inod(er|[^e])/, 'mod$1' ], // avoid ..node...
		[ /ninent/, 'nment' ], // government/s
		[ /\bcomin([au])/, 'commu$1' ], // community, communication, command
		[ /\biny(|self)\b/, 'my$1' ],
		[ /\binen\b/, 'men' ],
		[ /([^mst])inent/, '$1ment' ], // document...
		[ /(to|for|by|with|told|tell|let|g[ia]ve|from|towards|[oui]nto|under) ine\b/, '$1 me' ], // ine could be a suffix, so hit the common ones by ngram
		[ /\bimined/, 'immed' ],
		[ /\binean(|s)\b/, 'mean$1' ],
		[ /\bMohainn/, 'Mohamm' ],
		[ /sinug/, 'smug' ],
		[ /inforin/, 'inform' ],
		[ /\bhiin(self|)\b/, 'him$1' ],
		[ /\b([Ee])nin(i|e)/, '$1nm$2' ], // enmity, enmesh..
		[ /\b([Ff])roin\b/, '$1rom' ],
		[ /([Mm])einb/, '$1emb' ],
		// in -> th
		[ /(?<=(?:[Ii]n|[Tt]o|[Ff]or) )ine(?=\b|re\b|se\b|ir\b)/, 'the' ],
		// io -> w
		[ /\bneio(|ly)\b/, 'new$1' ],
		// ir -> n
		[ /\biir/, 'in' ],
		// it -> n
		[ /meitt/, 'ment' ],
		// iv -> j
		[ /\biv(?=st\b)/, 'ju' ],
		// iv -> w
		[ /\bneiv(|ly)\b/, 'new$1' ],
		[ /tiveen/, 'tween' ],
		// IVI -> M
		[ /\bIVI(?=[a-z])/, 'M' ],
		// j -> f
		[ /\boj\b/, 'of' ],
		// j -> i
		[ /thjs/, 'this' ],
		// J -> I
		[ /\bJowa/, 'Iowa' ],
		// J -> G
		[ /\b\(J(?=uide)/, 'G' ],
		// J -> l
		[ /\bJibert/, 'libert' ],
		[ /(?<=\b[Bb])jood/, 'lood' ], // blood
		[ /ojher/, 'other' ],
		// j -> y
		[ /ojal/, 'oyal' ],
		[ /\b([Mm])anj\b/, '$1any' ],
		[ /\b([Tt])hej\b/, '$1hey' ],
		// Ji -> h
		[ /Jiave/, 'have' ],
		[ /tJie/, 'the' ],
		// jl -> d
		[ /arjl/, 'ard' ],
		// jj -> g
		[ /jjht/, 'ght' ],
		// j}3^ -> y
		[ /(3|j|\})\^/, 'y' ],
		// k -> ic
		[ /whkh/, 'which' ],
		// kl -> d
		[ /Eklinb/, 'Edinb' ],
		// K -> E
		[ /Kng/, 'Eng' ],
		// l -> nothing
		[ /\b(|in)diflferent/, '$1different' ],
		[ /\beitlher\b/, 'either' ],
		[ /eaclh/, 'each' ],
		[ /Clhin(a|ese)/, 'Chin$1' ],
		// no empty alternative in the lookbehind: it made the assertion always
		// true, so e.g. "guild" lost its l
		[ /(?<=[Ff]l|[Dd]r|ang|[Qq]|iq)uild/, 'uid' ], // fluid etc
		[ /(?<=\b[Tt])(?:lh|hl|jh|hj)(?=[ieo])/, 'h' ], // the, these, those, etc
		// l -> d
		[ /listor/, 'distor' ], // distort...
		// l -> f
		[ /\bol\b/, 'of' ],
		[ /\bl(orm)\b/, 'f$1' ],
		// l -> i
		[ /fui(\b|ness\b)/, 'ful$1' ],
		[ /(d|D)ipio/, '$1iplo' ],
		[ /(P|p)arll/, '$1arli' ],
		[ /\bWilllam/, 'William' ],
		[ /\b([Ff])lc/, '$1ic' ], // fiction
		[ /\b([Tt])helr/, '$1heir' ],
		[ /(?<=[Rr]|[Vv]|[Dd]|[Tt]|[g]|[Ff]|[Mm])ellc/, 'elic' ], // relic, delicate,
		// l -> I
		// NOTE(review): the leading " is consumed and not restored by the
		// replacement - confirm whether it is intentional or a stray character
		[ /"\blon(a|ian)/, 'Ion$1' ],
		[ /\bl'(ve|ll)\b/, "I'$1" ],
		[ /\blt('?s|self)\b/, 'it$1' ],
		// l -> h
		[ /(a|o)rslip/, '$1rship' ], // scholarship, warships, worship
		// (leave "whicli" for the li -> h rule further down)
		[ /\b([Ww])hicl(?!i)/, '$1hich' ],
		[ /(\w)encl\b/, '$1ench' ], // french, bench...
		// l ->li
		[ /\blke/, 'like' ],
		// l -> t
		[ /([0-9])lh\b/, '$1th' ],
		[ /\boul/, 'out' ],
		[ /([Aa])fler/, '$1fter' ],
		[ /ifl(?=\b|ness|ly)/, 'ift' ], // swift
		// la -> h
		[ /\bthrougla/, 'through' ],
		// NOTE(review): (?<!c) looks at the "a" just matched and is therefore
		// always true - possibly meant as a lookahead; left unchanged
		[ /\btla(?<!c)/, 'th' ],
		// li -> b
		[ /\blio([^n])/, 'bio$1' ], // not lion...
		[ /liject/, 'bject' ], // subject
		// li -> lh
		[ /\botlier(|s|wise)/, 'other$1' ],
		[ /\b([Mm])onarcli(|s|y)/, '$1onarch$2' ],
		// lT -> ff
		[ /di(lT|flP)ere/, 'differe' ],
		// l) -> b
		[ /al\) ?le\b/, 'able' ],
		// l^ -> f
		[ /l\^(?=[a-z])/, 'f' ],
		// li -> b
		[ /\bliy\b/, 'by' ],
		// li -> h ... "the", "them", "their", "with", "much", "here" and whe etcetera
		[ /([tT][Jl]i)(e|at|is|an|em|ear|eir|en|ither|ose|rough|ree)\b/i, 'th$2' ],
		[ /\b([SsWw])lie/, '$1he' ], // she, when...
		[ /\b([Ww])li(at|ole)/, '$1h$2' ], // what, whole
		[ /(wlicli|ivhic(li|h)|wliich|wiiich|whicli)/, 'which' ],
		[ /liurcli/, 'hurch' ],
		[ /\bli(ave|ere|is|ad|ard)/, 'h$1' ],
		[ /\bIl(is)\b/, 'H$1' ],
		[ /witli/, 'with' ],
		[ /mucli\b/, 'much ' ],
		[ /\blias/, ' has' ],
		[ /\bwlio/, 'who' ],
		[ /\b(an|)otlier\b/, '$1other' ],
		[ /ealtli/, 'ealth' ],
		[ /([Cc])lii/, '$1hi' ], // China/ese...
		[ /([SsMm])ucli/, '$1uch' ], // such, much
		[ /cliann/, 'chann' ],
		[ /ubhs/, 'ublis' ], // publish
		[ /\bliate/, 'hate' ],
		[ /liion/, 'hion' ], // fashion
		[ /(?<=[Tt])liing/, 'hing' ], // thing
		[ /(?<=[Nn]e|[Ee])itlier/, 'ither' ], // either, neither
		[ /(?<=[Cc]|\b)liarm/, 'harm' ],
		// li -> k
		[ /([LlBb])ooli(\b|s)/, '$1ook$2' ],
		// llt -> th
		[ /\bllt(e)\b/, 'th$1' ],
		// lli -> th
		[ /\blli(at|e)\b/, 'th$1' ],
		// ln -> b
		[ /suln/, 'sub' ],
		[ /([Hh])md/, '$1ind' ],
		// lu -> hi
		[ /(?<=[a-z][^li])lucal/, 'hical' ], // -graphical
		// m -> in
		[ /mg\b/, 'ing' ],
		[ /\bopm/, 'opin' ],
		[ /Chm(a|ese)/, 'Chin$1' ],
		[ /(?<=\b[Pp]la)m/, 'in' ],
		// m -> n
		[ /\bFramce/, 'France' ],
		[ /\bFremch/, 'French' ],
		[ /\bJume\b/, 'June' ],
		// m -> on
		[ /atim\b/, 'ation' ],
		[ /\b(V|v)erbation\b/, '$1erbatim' ], // fix verbatim
		// m -> rn
		[ /ceming\b/, 'cerning' ],
		[ /\b([Un]w|[Ww])om\b/, '$1orn' ],
		[ /(?<=[Nn]orth|[Ss]outh|[Ee]ast|[Ww]est)em\b/, 'ern' ],
		[ /(?<=B[ij]?[oö])m\b/, 'rn' ], // Born, Bjorn, Björn
		[ /Foumier/, 'Fournier' ],
		// m -> un
		[ /\bmorth/, 'unorth' ],
		// m -> w
		[ /\b([Nn])em([^aeo]|\b)/, '$1ew$2' ], // new, newly, news
		// mn -> nm
		[ /mnent/, 'nment' ],
		// mu -> nm
		[ /\bumu(?=[aeiou])/, 'unm' ],
		// M -> N
		[ /\bNongol/, 'Mongol' ],
		// n -> a
		[ /(G|g)rent/, '$1reat' ],
		[ /\bns/, 'as' ],
		[ /ncknow/, 'acknow' ],
		// n -> h
		[ /\btn(e|a)/, 'th$1' ],
		[ /\bwn/, 'wh' ],
		[ /([Ss])mitn/, '$1mith' ],
		// n -> in
		[ /(?<=[^Eaeiou])ng\b/, 'ing' ], // -ing
		// n -> m
		[ /(?<=I|i)nperi/, 'mperi' ], // imperial
		[ /(?<=H|h)inse/, 'imse' ], // himself
		[ /iun\b/, 'ium' ],
		[ /(?<=\b[a-z]\w+l)don/, 'dom' ], // no lowercase ends ldon
		[ /(?<=[Nn])unber/, 'umber' ],
		[ /stanp/, 'stamp' ],
		[ /\bn(?=ores?\b|oreover)/, 'm' ],
		// n -> o
		[ /\bnf/, 'of' ],
		// n -> ri
		[ /scnb/, 'scrib' ],
		// n -> u
		[ /\bont (of|the|to|in|a|that|and|for|with|by)\b/, 'out $1' ], // ont may be suffix, filter by common ngram
		[ /([Nn])nm(?!a)/, '$1um' ],
		[ /snb/, 'sub' ],
		[ /onsly\b/, 'ously' ],
		[ /(C|c|w|W|Sh|sh)onld/, '$1ould' ],
		[ /\b([Tt])h(r?)ongh/, '$1h$2ough' ], // though, through-
		[ /\b([Aa])bont\b/, '$1bout' ],
		[ /thongh/, 'though' ],
		[ /\b([Cc])onrt/, '$1ourt' ], // court
		// na -> m
		[ /\b([Hh])ina(|self)\b/, '$1im$2' ],
		// ni -> m
		[ /(?<=\b|[Hh]ere-?|[Hh]ence-?)froni(?=\b|age|ward)/, 'from' ],
		[ /(?<=\b[Ww])honi/, 'hom' ],
		[ /\bhini/, 'him' ],
		[ /(?<=in|)hunian/, 'human' ],
		[ /\bnian(?=u|ly|kind)/, 'man' ], // not too general, mind pinyin
		[ /\brenio/, 'remo' ],
		[ /\bni(?=ak)/, 'm' ],
		[ /niouth/, 'mouth' ], // mouth, Plymouth, etc
		[ /(?<=[Cc]o)ni(?=plet)/, 'm' ], // complete
		// ni -> m
		[ /\bnie\b/, 'me', { notLangs: [ 'de', 'pl', 'zh-pinyin' ] } ],
		[ /\bnian/, 'man', { notLangs: [ 'zh-pinyin' ] } ],
		[ /\btians/, 'trans', { notLangs: [ 'zh-pinyin' ] } ],
		// nn -> rm
		[ /(?<=[Ff])onn(?!ish)/, 'orm' ], // formula, form, etc
		// nv -> rw
		[ /nva(?=y|rd)/, 'rwa' ], // afterward, Norway
		// o -> a
		[ /\bouth(or|en)/, 'auth$1' ], // authority...
		[ /fovo(u?)r/, 'favo$1r' ],
		[ /\b([Cc])ous([ae])/, '$1aus$2' ], // cause
		// o -> c
		[ /jeot/, 'ject' ],
		[ /(?<=[Oo])oo(?=as|i[cp]|u[pl]|lu)/, 'cc' ],
		[ /(?<=[Oo])co(?=asi|lus|lud|upa|upi|ur)/, 'cc' ], // occasion, occur,
		[ /(?<=[Ss]uc)oe/, 'ce' ], // success
		[ /(?<=[Aa]c)o(?=us[ae]|ept|iden|ord)/, 'c' ], // accuse, accept
		[ /(?<=[Aa]r|ac)oh(?=[io])/, 'ch' ], // archi..., Gracchi,
		// o -> e
		[ /(?<=dis|\b)rospect/, 'respect' ],
		[ /turo\b/, 'ture' ],
		[ /([d])loss/, '$1less' ], // endless
		[ /\b([Mm])ako\b/, '$1ake' ],
		[ /\b([Mm])ado\b/, '$1ade' ],
		[ /noss(?=\b|es|like)/, 'ness' ],
		[ /\bcomo\b/, 'come', { notLangs: [ 'es' ] } ],
		// o -> n
		[ /tioos/, 'tions' ], // could be o -> u, but choose one
		[ /iog(|s)\b/, 'ing$1' ],
		// o -> u
		[ /egolar/, 'egular' ], // regular
		// ol -> d
		[ /nolix/, 'ndix' ],
		// p -> d
		[ /ecorp([^o]?)\b/, 'ecord$1' ],
		// p -> f
		[ /\bop\b/, 'of' ],
		// P -> F
		[ /\bP(ee)\b/, 'F$1' ],
		[ /\bOP\b/, 'OF' ],
		// p -> g
		[ /inp\b/, 'ing' ],
		[ /(?<!u)prap/, 'grap' ],
		// p -> n
		[ /apd\b/, 'and' ],
		// p -> o
		[ /prth/, 'orth' ],
		// P -> ?
		[ /([a-z])P\b/, '$1?' ],
		// q -> o
		[ /qf/, 'of' ],
		// Q -> G
		[ /\bGu(?=ite?|ee[rn]|i[dzvxp]|ir[^o]|in[tq]|iet|ick|ibb)/, 'Qu' ],
		// r -> c
		[ /jert/, 'ject' ], // object, etc
		[ /(\w)reive/, '$1ceive' ], // perceive, receive, etc
		[ /anrs\b/, 'ani\'s' ], // names ending in ani + 's
		// r -> i'
		[ /prs\b/, 'pi\'s' ],
		// r -> n
		[ /\bupor\b/, 'upon' ],
		// r -> v
		[ /(he|[iasolurn])sire/, '$1sive' ],
		[ /siveless/, 'siveness' ], // after sire->sive
		[ /\b(M|m)orement/, '$1ovement' ],
		[ /\b(G|g)orernment/, '$1overnment' ],
		[ /\b([Oo])bserr/, '$1bserv' ],
		// r -> t
		[ /(?<=\b[Ii])r\b/, 't' ],
		// r^ -> p
		[ /\br\^/, 'p' ],
		// ri -> n
		[ /(?<=\b[Mm]e)ri\b/, 'n' ],
		// ri -> u
		[ /ectrial/, 'ectual' ],
		// rj -> n
		[ /\birj/, 'in' ],
		// rn -> m
		[ /([aie])urn\b/, '$1um' ],
		[ /\brern/, 'rem' ],
		[ /ernent/, 'ement' ],
		[ /\brn/, 'm' ],
		// s -> a
		[ /grsph/, 'graph' ],
		[ /csuse/, 'cause' ],
		// s -> m
		[ /\b([Ss])ees(ing|ingly|ed|s)\b/, '$1eem$2' ], // seemed
		// sb -> sh
		[ /\bsb(e|all)\b/, 'sh$1' ],
		// sc -> g
		[ /insc\b/, 'ing' ],
		// t-> c
		[ /ettual/, 'ectual' ],
		[ /fetted/, 'fected' ],
		// t -> f
		[ /\bot\b/, 'of' ],
		[ /fitty/, 'fifty' ],
		// t -> i
		[ /shtp/, 'ship' ],
		[ /(?<=[Bb]u|[Cc]h|[Mm])tld/, 'ild' ],
		[ /(?<=[Bb]u|[Gg]u?|[Tt]|[Ss]|[Ff]|[Ww])tlt/, 'ilt' ],
		[ /\btn\b/, 'in' ],
		// T -> nothing (and some I -> nothing)
		[ /\bw [IT] (?=as|hich|hen|hether|ho)/, 'w' ], // w T as > was, etc
		// T -> I
		[ /(?<!\bw )\bT(?=\b|t)/, 'I' ],
		[ /T(?=reland|rish)/, 'I' ],
		// t -> l
		[ /abte\b/, 'able' ],
		[ /(?<=[WwCc]|[Ss]h)outd/, 'ould' ],
		// t -> r
		[ /(?<=\b[Ff])ot(?!h|o|i|u|m|c)/, 'or' ],
		[ /(?<=\b[Ff])t(?=ance|ench)/, 'r' ], // France, French
		[ /ntt(?=y|ies)/, 'ntr' ], // country
		[ /(?<=[Ll]ive)t(?=s|p|\b)/, 'r' ], // liver, Liverpool
		// T -> Y
		[ /(?<=\b(?:JUL|JOURNE|M|WA))T\b/, 'Y' ],
		[ /(?<=\b(?:MON|TUES|WEDNES|THURS|FRI|SATUR|SUN|))DAT\b/, 'DAY' ],
		// ti -> h
		[ /\b([Oo])ttier(?=\b|[^eis])/, '$1ther' ],
		// ti -> n
		[ /tioti/, 'tion' ],
		// ti -> u
		[ /\btipon/, 'upon' ],
		// to -> h
		[ /\bttoe(?![ds]\b)/, 'the' ],
		// U -> li, see h/U
		[ /(?<=\b|[a-z])Uon(?=s?)/, 'lion' ],
		[ /(?<=[a-z])Ung(?=s?)/, 'ling' ],
		// u -> a
		[ /Junu([^b])/, 'Janu$1' ],
		[ /\bund\b/, 'and' ],
		// u -> c
		[ /([Dd])ouum/, '$1ocum' ],
		// u -> h
		[ /(?<=\b[Tt])u(?=e[^s]|at\b)/, 'h' ], // the, there, these, etc (not Tuesday)
		// u -> n
		[ /\baud\b/, 'and' ],
		[ /meut(\b|[^e])/, 'ment$1' ],
		[ /siau(|s)\b/, 'sian$1' ], // Persians...
		[ /\b(P|p)ersou(|s)/, '$1erson$2' ],
		[ /erument/, 'ernment' ],
		[ /([Jj])uuc/, '$1unc' ],
		[ /taiu/, 'tain' ],
		[ /\biu(|to|ward)\b/, 'in$1' ],
		[ /\bauy(|where|body)\b/, 'any$1' ],
		[ /\biuto\b/, 'into' ],
		[ /kuow/, 'know' ],
		[ /iug(s|ed|ly|)\b/, 'ing$1' ],
		[ /auswer/, 'answer' ],
		// u -> ii
		// [ /(?<=\b[clxv]*)u(?=i*)/, 'ii' ], // roman numerals
		// "U" -> "ll" when preceded by a lowercase letter.
		// "U" -> "li"
		[ /(?<=[a-z])U(?=c)/, 'li' ], // relic
		[ /(?<=[a-z])U(?!c)/, 'll' ], // not relic
		// un -> m
		[ /\bimuned/, 'immed' ],
		// ui -> m ... "must", etc
		[ /\bui(ust)\b/, 'm$1' ],
		// v -> r
		[ /([Mm])emov/, '$1emor' ],
		// v -> u
		[ /\b([Nn])vm/, '$1um' ],
		// v -> y
		[ /\bv(ears?|ours?|ou)\b/, 'y$1' ], // year(s), your(s), you
		[ /\b(B|b|M|m|the)v/, '$1y' ],
		[ /\b(A|a)nv(\b|w)/i, '$1ny$2' ],
		[ /vield/, 'yield' ],
		[ /encv\b/, 'ency' ],
		[ /(?<=\b[GgHh])aye\b/, 'ave' ],
		[ /([Aa])bbev/, '$1bbey' ],
		[ /demv\b/, 'demy' ],
		[ /mplov/, 'mploy' ], // employ-...
		[ /itv\b/, 'ity' ],
		[ /(?<=[Vv])erv\b/, 'ery' ],
		[ /(?<=(Mon|Tues|Wednes|Thurs|Fri|\b)da)v(?=s?\b)/, 'y' ],
		// v -> w
		[ /\bvr/, 'wr' ],
		// v^ -> w
		[ /\bv[\^/]([a-z])/, 'w$1' ],
		// vc -> we
		[ /\bvc\b/, 'we' ],
		// vd -> wi
		[ /vd(ll|th)/, 'wi$1' ],
		// V -> m
		[ /\bV(iss|rs|r)\b/, 'M$1' ],
		// Vh ->Wh
		[ /\bVh/, 'Wh' ],
		// V' -> W
		[ /\bV'/, 'W' ],
		// Vi -> M
		[ /\bVir\b/, 'Mr' ],
		// vir -> w
		[ /hovir(?!u)/, 'how' ],
		// vn -> wi
		[ /vn(ll|th)/, 'wi$1' ],
		// VV -> W
		[ /\bVV(e)\b/, 'W$1' ],
		// w -> m
		[ /mewt(?!tide)/, 'ment' ],
		// w r -> w (not sure what this is about)
		[ /\bw r (?=e\b|[aeoiu]\w)/, 'w' ],
		// X -> N
		[ /\bX(?=o)/, 'N' ],
		// xv -> w
		[ /xvho/, 'who' ],
		[ /xvay/, 'way' ],
		[ /txvo/, 'two' ],
		// y -> v
		[ /([Ss])ery(a|i)/, '$1erv$2' ],
		[ /tiye(|ly|ness|nesses|s)\b/, 'tive$1' ],
		[ /eyies\b/, 'evies' ],
		[ /(?<=\b(?:[Hh]a|[BbGg]ra))ye\b/, 've' ], // have, grave, brave
		[ /\b([Oo])by(?=\B)/, '$1bv' ],
		[ /(?<=\bGene)ya/, 'va' ],
		// the patterns below must contain the mis-read "y", otherwise they
		// replace a string with itself
		[ /\beyent/, 'event' ],
		[ /yent(?=\b|s|ed|or|ing|y\b|ies|ral|ro|ur|il|ri)/, 'vent' ],
		// Y -> T
		[ /\bY(?=he)/, 'T' ],
		// Y -> V
		[ /(?<=\bGENE)YA/, 'VA' ],
		[ /\bEYENT/, 'EVENT' ],
		[ /YENT(?=\b|S|ED|OR|ING|Y\b|IES|RAL|RO|UR|IL|RI)/, 'VENT' ],
		// z -> x
		[ /\bezc/, 'exc' ],
		// -> Rome/Roman
		[ /(E|K)om(e|an|ish)([ .,\n])/, 'Rom$2$3' ],
		// d', l', m', n' (not s', or english possesives get messed with)
		[ /(^|\s)([MmDdLlNnJjSsCc]|[Qq]u|[Jj]usqu)(' | ')(?=[AaEeIiOoUuÁáÀàéÉèÈ])/, "$1$2'" ]
	];
	process_editor( editor, new PartialWordRegexProcessor( reps ) );
};
/**
 * Fix scannos that involve word boundaries: dropped spaces, spurious spaces,
 * missing hyphens, and prefixes/suffixes that were split off their word.
 *
 * Runs five passes over the editor, each through process_editor() with a
 * different processor: partial-word regex replacements, banned suffixes,
 * banned prefixes, orphaned prefixes and orphaned suffixes.
 *
 * @param {Object} editor The editor object handed to process_editor()
 */
const do_multiword_fixes = function ( editor ) {
	let reps = [
		// hyphens more likely to be em-dash
		// (lookbehind was mistyped as "(<?=", which matched a literal "<=")
		[ /(?<=[a-z])-(the)\b/, '—$1' ],
		// Missing spaces
		// theCap unlikely to be right
		[ /\b(a|an|of|by|the)(?=[A-Z])/, '$1 ' ],
		// single cap in a word probably a dropped space
		// watch for Mc/Mac
		// needs lookbehind really
		[ /\b(\w[a-z]*[abd-z])([A-Z][a-z]+\b)/, '$1 $2' ],
		// ance is a suffix when it's not ancestor's prefix
		[ /[\s-]ance(?! st[or])\b/, 'ance' ],
		[ /\bal though/, 'although' ],
		// and<dropped space>
		// not many words start and
		[ /\band((?=[a-z])[^raoei])/, 'and $1' ],
		[ /\bbet ween/, 'between' ],
		// I
		[ /I(am\b|had|was|will|can|shall|did)/, 'I $1' ],
		// he
		[ /([Hh]e)(had|did|can|will|was)/, '$1 $2' ],
		// him
		[ /(?<=\b([Hh]im))t/, ' t' ], // e.g. himto -> him to
		[ /notbe/, 'not be' ], // cannot be, not being, ...
		[ /([deos])n(' | ')t\b/, '$1n\'t' ],
		[ /\bcom m/, 'comm' ],
		// compare, incomparable (lookbehind was mistyped as "(<?=")
		[ /(?<=in|\b)com par/, 'compar' ],
		// government can only be -a, -s, -e
		[ /(overnment)((?=\w)[^sae])/, '$1 $2' ],
		[ /((?=\w)[^sa])may/, '$1 may' ], // dismay/gamay are the only words end in may
		[ /\bme(of|to|for|that)\b/, 'me $1' ],
		[ /(s|t)my\b/, '$1 my' ], // -my isn't always a likely suffix
		[ /\bof(a|b|c|d|g|m|n|p|s|w)/, 'of $1' ], // of my/self, etc words that can't start of-
		[ /\bof(our|my|some|him|her|his)\b/, 'of $1' ],
		// of merged left, careful of Russian names...
		[ /(Earl|Duke|Queen|King|Baron|most|all|some|many)of/, '$1 of' ],
		[ /([a-z])which/, '$1 which' ], // only wrong for everwhich
		// no word ends -many except overmany
		[ /([^Oo]?[^v]?[^e]?[^r\s])many/, '$1 many' ],
		// she
		[ /([Ss]he)(had|did|will|was)/, '$1 $2' ],
		[ /\bthus(?!ly|\b)/, 'thus ' ], // no words start thus
		// some obvious loss of spaces after 'the'
		[ /\bthe(?=h|me[nm]|mer[c]|mo|im|un|wh)/, 'the ' ],
		// and before 'the'
		[ /\b(\w[^aoniy\s])the\b/, '$1 the' ],
		// before 'to'
		[ /\b(thing)to\b/, '$1 to' ],
		[ /(u|n|r) (dices?)\b/, '$1$2' ],
		[ /\bun der/, 'under' ],
		[ /\brene w(ed|al|abl)\b/, 'renew$1' ],
		[ /\bre turn/, 'return' ],
		// words ending in cious that lost a space
		[ /cious((?=[a-z])[^enl])/, 'cious $1' ],
		// Spurious spaces
		[ /\b(P|p)ro ceed/, '$1roceed' ],
		[ /\b(P|p)ro ced/, '$1roced' ],
		[ /(C|c)on cl/, '$1oncl' ], // con clude
		[ /(un)?ans wer(a|e|s|\b)/, '$1answer$2' ],
		[ /same(a|b|c|f|g|h|i|j|k|m|o|p|q|u|v|w|x|y|z)/, 'same $1' ],
		[ /\bho w/, 'how' ], // however...
		[ /\b(dis|)satis fact/, '$1satisfact' ],
		[ /\bendo (wed|wing|wments?)/, 'endo$1' ],
		[ /\bre[ -](quest|quire|solute)/, 're$1' ],
		[ /\bwasnot\b/, 'was not' ],
		[ /\b(ly)(worked)\b/, '$1-$2' ],
		// missing hyphens
		[ /\binchief(?=s?\b)/, 'in-chief' ],
		[ /(?<=y)public(?=s?\b)/, '-public' ], // notary-public, ...
		// Lone quotes at the start of a quotation
		[ /(?<=(said|answered|replied|shouted|thought|whispered|murmured|muttered|), ") /, '' ],
		// spurious punctuation, eg why. not, but avoid e.g. i.e. etc
		[ /([a-z]{3,})\. ([a-z])/, '$1 $2' ]
	];
	process_editor( editor, new PartialWordRegexProcessor( reps ) );
	// These are things that are never suffixes
	// eg. hecould -> he could
	reps = [
		/(c|sh|w)ould(n't)?/
	];
	process_editor( editor, new BannedSuffixProcessor( reps ) );
	// These can never be prefixes
	// so insert spaces after them
	reps = [
		/[Aa](?=number|bond\b|comm|rece|reci[^b])/,
		/a(?=dele)/,
		/be(?=my)/,
		/but(?=al)/, // but all, but always
		/come(?=to)/,
		/great(?=m|p|r)/,
		/[HhSsGg]ave(?=my)/, // h/gave my/self
		/me(?=wit|tow)/,
		/means/,
		/of(?=the)/,
		/sent(?=as)/,
		/some(?=[cm])/,
		/that(?=can|d|w)/, // that will
		/the(?=mes|tr|e\w)/,
		/(?:un|)usual(?!s|ness|ly)/,
		/I(?=h[eiou])/,
		/I(?=ha[^b])/, // I have/had
		/with(?=a\b|a[^lm]|all)/,
		/with(?=his|her|it|th|ha)/
	];
	process_editor( editor, new BannedPrefixProcessor( reps ) );
	// if we see these on their own, they are prefixes of the next word
	// These can be slightly aggressive, as they only fire if the prefix is
	// already isolated - they won't break up existing words
	let orphans = [
		/(a|fo)llo/, // allocate, follow
		/(un|)acknow/,
		/(|[Ii]n)conse/, // consequence, consecrate
		/circum/,
		/combin?/,
		/(|[Ii]n)compa/,
		/(|[iI]n)comple/,
		/(|[Ii]n)corp/,
		/\w*corres?/,
		/diffi/, // difficult, diffident
		/dis/, // very few words end dis, so an orphan is likely a prefix
		/decla?/, // ration can't be a simple suffix
		/ered/,
		/exper?/,
		/helio/,
		/inex/,
		/medi/, // medicine/s, medical
		/misbe/,
		/(|in)oppor/,
		/(|dis|co-?|acc|in|sub|super)ordin?/,
		/[Pp]arti/,
		/[Pp]hilo/,
		/(|im|mal)prac/,
		/(|im)practi/,
		/pre/, // pre is occasionally a suffix, but it's
		/(|un)[Pp]rinci/,
		/reca/,
		/(|p|un|under)recom/, // recommend
		/repre/,
		/(|un|tran)sub/,
		/suc/, // success...
		/(|un)sug/, // suggest, sugary../
		/sur/, // sur-
		/trans/,
		/undis/,
		/whatso/
	];
	process_editor( editor, new OrphanPrefixProcessor( orphans ) );
	// if we see these on their own, they're suffixes of the prior word
	orphans = [
		/astic/,
		/ated/,
		/atory/,
		/(|ond|ti)ar(y|ies)/,
		/tably/,
		/butors?/,
		/cating(|ly)/,
		/cellation(|s)/,
		/cien(cy|t)/,
		/ciples?/,
		/dences?/,
		/derable/,
		/digent(|s)/,
		/dit(y|ies)/,
		/drawals?/, // only withdrawal
		/ested(|ly|ness)/,
		/esque(|ly)/,
		/ficial\w*/,
		/geous(|ly|ness|nesses)/,
		/gences?/,
		/hend(|s|ing)/,
		/iast\w*/,
		/ings?/, // ing is rarely a prefix, much more likely to be -ing if it occurs alone
		/lants/,
		/lated/,
		/lative(s|ly|)/, // comp-, decla-
		/ligent(|ly|sia|sias)/,
		/mations?/, // not motions
		/munication?/,
		/ments?/,
		/mence\w*/, // commence
		/mitted(|ly|ness)/,
		/nect(ed|ions?)/,
		/nence/,
		/nese/,
		/nien(ce|ces|ced|t)/,
		/m?on(ing|ed)/, // summoned, commissioned...
		/pan(y|ies)/,
		/pensat\w+/, // compensate
		/plet(ed|ion|ions)/,
		/politan\w*/,
		/pl?oration(|s|al)?/,
		/rative(s|ly|)/, // comp-, decla-
		/rit(ies|y)/,
		/rence(|d|s)/,
		/saries/, // anniversaries...
		/sion\w*/,
		/siderable\w*/, // avoid sideral/sideration
		/sume(\b|[^r]\w*|r[^i]\w*)/, // avoid -sumeria
		/stantly/,
		/tain(ed|s)/,
		/[as]tr[au]ction(|s|al|ary|ally)/,
		/[szt]?[aoiue]?tion(|s|al|ally)/, // not ration
		/tages?/,
		/ti[vn]ely/,
		/tinual(|ly|ness|ity)/,
		/tinuous(|ly|ness)/,
		/b?ilit(ies|y)/,
		/vid(es|ing)/,
		/wered/
	];
	process_editor( editor, new OrphanSuffixProcessor( orphans ) );
};
/**
 * Italicise a fixed set of common Latin phrases.
 *
 * @param {Object} editor The editor object handed to process_editor()
 */
const do_foreign_italics = function ( editor ) {
	const latinPhrases = [];
	latinPhrases.push( /\bad (hoc|.*um|.*em)\b/ );
	latinPhrases.push( /de facto/ );
	latinPhrases.push( /quid pro quo/ );
	latinPhrases.push( /locum tenens/ );
	latinPhrases.push( /\b[Ii]bid\b/ );

	const italiciser = new ItaliciseProcessor( latinPhrases );
	process_editor( editor, italiciser );
};
/**
 * Apply simple whole-word replacements.
 *
 * The replacement table is currently empty; the pass is still run so that
 * entries can be added here without touching the pipeline.
 *
 * @param {Object} editor The editor object handed to process_editor()
 */
const do_whole_words_reps = function ( editor ) {
	const wholeWordTable = [];
	const processor = new WholeWordRegexProcessor( wholeWordTable );
	process_editor( editor, processor );
};
/**
 * Repair words where a long s was OCR'd as "f" (or left as a lookalike
 * glyph).
 *
 * The table is passed to a PartialWordRegexProcessor; each entry is
 * [ pattern, replacement ]. Entries are written assuming earlier ones have
 * already been applied (see the "do before" / "comes later" notes), so keep
 * the order when adding rules.
 *
 * @param {Object} editor The editor object handed to process_editor()
 */
const doLongSReplacements = function ( editor ) {
	const long_s_reps = [
		// fix bad long s replacements
		[ /ƒ/, 'f' ],
		[ /ʃ/, 's' ],
		[ /([^i])fic\b/, '$1sic' ],
		[ /([Ee])aft/, '$1ast' ],
		[ /([W])eft/, '$1est' ], // assume Weft is West, but weft is like fabric
		[ /(af|un)?focia/, '$1socia' ],
		[ /(?<=[Aa])ff(embl|ign)/, 'ss$1' ], // assign, assemble..
		[ /(A|a)nfwer/, '$1nswer' ],
		[ /(ef)?fent/, '$1sent' ], // essential, sent, sentinel
		[ /(other|like)wife/, '$1wise' ],
		[ /\bfide\b/, 'side' ],
		[ /\bfo\b/, 'so' ],
		[ /\breft/, 'rest' ],
		[ /([Aa])bfo/, '$1bso' ],
		[ /ccef[fs]/, 'ccess' ],
		[ /bfurd/, 'bsurd' ],
		[ /affif/, 'assist' ],
		[ /aff(um|ur|er)/, 'ass$1' ], // assume, assure
		[ /(?<=A|a)fc/, 'sc' ], // ascent
		[ /Afia/, 'Asia' ],
		[ /(?<=A|a)fk/, 'sk' ], // ask
		[ /aftard/, 'astard' ],
		[ /aftic/, 'astic' ],
		[ /afty/, 'asty' ],
		[ /([Aa])lfo/, '$1lso' ],
		[ /([Aa])pfe/, '$1pse' ],
		[ /([Aa])ufp/, '$1usp' ],
		[ /baffy/, 'bassy' ],
		[ /([Bb])afe/, '$1ase' ],
		[ /([Bb]|[Cc]r)eft/, '$1est' ],
		[ /([Cc])afua/, '$1asua' ],
		[ /([Cc])auf/, '$1aus' ],
		[ /([Cc])eaf(?!a)/, '$1eas' ],
		[ /ceff/, 'cess' ], // necessary
		[ /cefs\b/, 'cess' ], // princess, process
		[ /([Cc])heft/, '$1hest' ],
		[ /Chrif/, 'Chris' ],
		[ /cife/, 'cise' ],
		[ /([Cc])laf[fs]/, '$1lass' ],
		[ /([Cc])lofe/, '$1lose' ],
		[ /([Cc])onf(id|t|eq)/, '$1ons$2' ], // const, conseq...
		[ /([Cc])ourfe/, '$1ourse' ],
		[ /([Cc])oft/, '$1ost' ],
		[ /([Cc])roff\B/, '$1ross' ], // cross-
		[ /([Cc])rofs\b/, '$1ross' ], // cross
		[ /([Dd])efcr/, '$1escr' ],
		[ /dorf(e|es|ed|ing|ings|ment)/, 'dors$1' ],
		[ /efer([vt])/, 'eser$1' ], // deserve-, desert-
		[ /([dD])if([ocprgqst]|ad)/, '$1is$2' ], // dis-
		[ /\b([dD])if([^f]\w)/, '$1is$2' ],
		[ /([Dd])iffol/, '$1issol' ],
		[ /([Dd])efir/, '$1esir' ],
		[ /efour/, 'esour' ],
		[ /offef[fs]/, 'ossess' ],
		[ /feffion/, 'session' ], // session (possession comes later)
		[ /(?<![A-Z]|ff|\b)eff(|ed|ion|ing|ly)/, 'ess$1' ], // express, etc
		[ /([Ee])fpe/, '$1spe' ], // especial
		[ /([Ee])fq/, '$1sq' ],
		[ /(?<=R|r|t|l|p)egift/, 'egist' ], // regist.., strategist, etc
		[ /(?<=en)lift/, 'list' ],
		[ /fenf(e|es|ed|ing|ings)\b/, 'sens$1' ],
		[ /enf(e|es|ed|ing|ings)\b/, 'ens$1' ],
		// the suffix is group 2 (was '$1est$1', which repeated the initial letter)
		[ /([Bb])eft(\b|ed|ing)/, '$1est$2' ],
		[ /([^kgrdw])eft\b/, '$1est' ], // -est
		[ /efide/, 'eside' ],
		[ /(?<=R|r)efort/, 'esort' ],
		// NOTE(review): duplicate of the regist rule above; kept so the table
		// length/order is unchanged for anything that refers to entries by position
		[ /(?<=R|r|t|l|p)egift/, 'egist' ], // regist.., strategist, etc
		[ /([Ee])fta/, '$1sta' ], // establish
		[ /([Ee])fti/, '$1sti' ], // estimate
		[ /enfes/, 'enses' ],
		[ /ennf/, 'enns' ], // Pennsylv etc
		[ /erfal/, 'ersal' ],
		[ /erfon/, 'erson' ],
		[ /erfua/, 'ersua' ],
		[ /erfue/, 'ersue' ],
		[ /erfui/, 'ersui' ],
		[ /eruf/, 'erus' ],
		[ /fa(cr|fe|ga|id|le|lut|lt|tis|w\b|nds?\b)/, 'sa$1' ],
		[ /\bfay/, 'say' ],
		[ /\bfa(ve|vi)/, 'sa$1' ],
		[ /(?<=F|\bf)alf/, 'als' ], // false
		[ /fatif(?!e)/, 'satis' ],
		[ /fca([^s])/, 'sca$1' ], // scarce, scant, etc (not briefcase)
		[ /fchem/, 'schem' ],
		[ /fc(ie|ious|ure|en|rib|rip)/, 'sc$1' ], // science, conscious, secure
		[ /fenf/, 'sens' ],
		[ /fe(a\b|af|cl|co|iz)/, 'se$1' ], // season, seclude, second
		[ /fee(m|n|ing)/, 'see$1' ], // seen, seem
		[ /fe(ek|gr|duc)/, 'se$1' ],
		[ /felec/, 'selec' ],
		[ /fel(f|v)/, 'sel$1' ],
		[ /(?<=[Aa]b|[Ii]n)fence/, 'sence' ],
		[ /fepar/, 'separ' ],
		[ /feri([eo])/, 'seri$1' ],
		[ /fervi/, 'servi' ],
		// keep the captured ending (was 'set', which turned "fetting" into "set")
		[ /\bfet(|ting|s|ter)\b/, 'set$1' ],
		[ /fettle(\b|m|s)/, 'settle$1' ], // fettle is a word, but settle is way more common
		[ /feve(ra|n)/, 'seve$1' ], // several, seven
		[ /fhew/, 'shew' ],
		[ /(?<=\ba?)fide(?=s?\b)/, 'side' ],
		[ /fing(le|u)/, 'sing$1' ], // single, singular
		[ /fis\b/, 'sis' ], // -sis
		[ /ffidu/, 'ssidu' ], // Assiduous
		[ /fh(al|ut|ip|o)/, 'sh$1' ],
		[ /inifter/, 'inister' ],
		[ /fidera/, 'sidera' ], // considerable/ation/ate
		[ /fift(?!h)/, 'sist' ], // subsist, consist
		[ /filen/, 'silen' ],
		[ /fign/, 'sign' ],
		[ /fimi/, 'simi' ],
		[ /fince/, 'since' ],
		[ /fion/, 'sion' ],
		[ /firft/, 'first' ],
		[ /fite\b/, 'site' ],
		[ /fitive/, 'sitive' ],
		[ /fitu/, 'situ' ],
		[ /flaught/, 'slaught' ],
		[ /flowl/, 'slowl' ],
		[ /flowne/, 'slowne' ],
		[ /fm(an|en|all|oth|ooth)/, 'sm$1' ], // small, helmsmen, smooth
		[ /focie/, 'socie' ],
		[ /fole/, 'sole' ],
		[ /foli/, 'soli' ],
		[ /folv/, 'solv' ],
		[ /fome/, 'some' ],
		[ /foon/, 'soon' ],
		[ /foph/, 'soph' ], // -sopher/y
		[ /fourc/, 'sourc' ],
		// pattern is lowercase-only, so the output stays lowercase (was 'South')
		[ /fouth/, 'south' ],
		[ /fov/, 'sov' ],
		[ /fpade/, 'spade' ],
		[ /fpawn/, 'spawn' ],
		[ /fpeak/, 'speak' ],
		[ /fpec/, 'spec' ],
		[ /fpee/, 'spee' ],
		[ /fpir/, 'spir' ], // spirit, spiral,
		[ /ft(air|an|at|eem|ep|ill|on|oo|r|ud|y)/, 'st$1' ],
		[ /\bft(\w)/, 'st$1' ],
		[ /fubf/, 'subs' ], // do before fub
		[ /fub/, 'sub' ],
		[ /fucc/, 'succ' ],
		[ /fuch/, 'such' ],
		[ /fued/, 'sued' ],
		[ /\bfu(e|es|ings?)\b/, 'su$1' ],
		[ /fuf(p)/, 'sus$1' ],
		[ /fuff/, 'suff' ],
		[ /fund(?!rais)/, 'sund' ],
		[ /fumm/, 'summ' ], // summit, summary
		[ /fuit/, 'suit' ],
		[ /fuper/, 'super' ],
		[ /fupp/, 'supp' ],
		[ /fu(re|rv)/, 'su$1' ],
		[ /fw(ay|ear|orn)/, 'sw$1' ],
		[ /fyf/, 'sys' ],
		[ /fym/, 'sym' ],
		[ /grefs/, 'gress' ],
		[ /hift/, 'hist' ],
		[ /(?<=[Hh])(ea|o|oa|ou)rf/, '$1rs' ], // house, hearse, horse
		[ /i[sf]cuff/, 'iscuss' ],
		[ /ifh/, 'ish' ],
		[ /ifm\b/, 'ism' ],
		[ /ifo\b/, 'iso' ],
		[ /ifon/, 'ison' ],
		[ /iftic/, 'istic' ],
		[ /([Ii])ffu/, '$1ssu' ],
		[ /illuf/, 'illus' ],
		[ /(I|i)nft/, '$1nst' ],
		[ /\b(?<=i|I)fl/, 'sl' ], // isle, island
		[ /Jefus/, 'Jesus' ],
		[ /(?<=J|j|I|i)urif/, 'uris' ],
		[ /([Jj])uft/, '$1ust' ],
		[ /([Ll])aft/, '$1ast' ], // last, lastly, etc
		[ /lefia/, 'lesia' ],
		[ /([Ll])egif/, '$1egis' ], // legislation...
		[ /([^ie])efs/, '$1ess' ], // -ess
		[ /(?<=l|L)eff/, 'less' ], // -ess-
		[ /lifle/, 'lisle' ],
		[ /lifh/, 'lish' ],
		[ /lufiv/, 'lusiv' ],
		[ /([MmPp])afs\b/, '$1ass' ],
		[ /([Mm])i(fs\b|ff\B)/, '$1iss' ], // miss, missing
		[ /([Mm])i(f\B)/, '$1is' ], // mistake
		[ /Missifippi/, 'Missisippi' ],
		[ /Missiffippi/, 'Mississippi' ],
		// preserve the captured initial's case (was a hard-coded lowercase word)
		[ /([Mm])oft/, '$1ost' ],
		[ /mongft/, 'mongst' ],
		[ /([Mm])uft/, '$1ust' ],
		[ /nefe/, 'nese' ],
		[ /nefs/, 'ness' ],
		[ /nfate/, 'nsate' ],
		[ /nfel(?=\b|s|led|l[oe]rs?)/, 'nsel' ],
		[ /nfive/, 'nsive' ],
		[ /oaft/, 'oast' ], // coast, etc
		[ /obf/, 'obs' ],
		[ /([Oo])bfe/, '$1bse' ], // observ
		[ /ofed/, 'osed' ],
		[ /offi/, 'ossi' ], // possible
		[ /ofition/, 'osition' ], // position, etc.
		[ /ofity/, 'osity' ],
		[ /oftil/, 'ostil' ], // hostile
		[ /ouf\b/, 'ous' ],
		[ /oufly/, 'ously' ],
		[ /([Pp])aft/, '$1ast' ],
		[ /hraf/, 'hras' ], // phrase
		[ /paff/, 'pass' ], // pass/age, for pafs, see mafs
		[ /([Pp])leaf/, '$1leas' ],
		[ /([Pp])of(e|t)/, '$1os$2' ], // post, pose, compose...
		[ /(?<=P|p)urfu/, 'ursu' ],
		[ /(?<=R|r)ef([pfs]|en|ume|ump)/, 'es$1' ],
		[ /([Rr])eleaf/, '$1eleas' ],
		[ /(?<=R|r)aif(e|i)/, 'ais$1' ], // raising, raised/r
		[ /\b([Aa]r|[Rr])if([ie])/, '$1is$2' ], // a/rising/ed/es
		[ /rofec/, 'rosec' ], // prosecute
		[ /rofef([sf])/, 'rofess' ],
		[ /rofp/, 'rosp' ],
		[ /urpof/, 'urpos' ],
		[ /([Qq])ueft/, '$1uest' ],
		[ /reafo/, 'reaso' ],
		[ /refea/, 'resea' ],
		[ /refi/, 'resi' ],
		[ /([Tt])afte/, '$1aste' ],
		[ /(?<=T|t)eft/, 'est' ],
		[ /terfect/, 'tersect' ], // intersect, but not perfect, etc
		[ /hefe/, 'hese' ], // these
		[ /([Hh])ofe/, '$1ose' ], // those, whose
		[ /tereft/, 'terest' ],
		[ /traft/, 'trast' ],
		[ /ranf/, 'rans' ], // trans-
		[ /ufe/, 'use' ],
		[ /uftom/, 'ustom' ],
		[ /vaft/, 'vast' ],
		[ /(?<=V|v)erf/, 'ers' ], // verse, versus
		// preserve the captured initial's case (was a hard-coded 'vess')
		[ /([Vv])eff/, '$1ess' ],
		[ /verf([eyo])/, 'vers$1' ], // verse, verso -versy
		[ /vife/, 'vise' ], // advise..
		[ /([Vv])ifi/, '$1isi' ],
		[ /ifdom/, 'isdom' ],
		[ /xift/, 'xist' ]
	];
	process_editor( editor, new PartialWordRegexProcessor( long_s_reps ) );
};
/**
 * Expand shorthand templates and tidy some markup in the page body, header
 * and footer fields.
 *
 * Every step is a single regex replace; the steps are listed as
 * [ field, pattern, replacement ] rows and applied strictly in that order.
 *
 * @param {Object} editor Editor object providing replace() and forField()
 */
const template_cleanup = function ( editor ) {
	const headerField = editor.forField( '#wpHeaderTextbox' );
	const footerField = editor.forField( '#wpFooterTextbox' );

	// Kept as a concatenation so the literal subst marker never appears
	// verbatim in this script's own source.
	// eslint-disable-next-line no-useless-concat
	const substTemplate = '{' + '{subst:$1}}';

	const steps = [
		// {{c}} to {{center}}
		[ editor, /{{c\|/g, '{{center|' ],
		[ headerField, /{{c\|/g, '{{center|' ],
		[ footerField, /{{c\|/g, '{{center|' ],
		// {{rh}} to {{RunningHeader}}
		[ headerField, /\n?{{rh\|/gi, '{{RunningHeader|' ],
		// {{hws}} & {{hwe}} expanded
		[ editor, /{{hws\|/g, '{{hyphenated word start|' ],
		[ editor, /{{hwe\|/g, '{{hyphenated word end|' ],
		// {{di}} expanded
		[ editor, /{{di\|/g, '{{dropinitial|' ],
		// {{hi}} expanded
		[ editor, /{{hi\|/g, '{{hanging indent|' ],
		// {{sm}} expanded
		[ editor, /{{sm\|/g, '{{smaller|' ],
		// expand diacritical templates
		[ editor, /{{(ae|oe|\w[:`'~^-])}}/g, substTemplate ],
		// convert {{—}} to —
		[ editor, /{{—}}/g, '—' ],
		// M<sup>c</sup> to {{Mc}}
		[ editor, /M<sup>c<\/sup>/g, '{{Mc}}' ],
		[ headerField, /M<sup>c<\/sup>/g, '{{Mc}}' ],
		// section tag fix
		[ editor, /<section (begin|end)=(\w[^/]+)\/>/g, '<section $1="$2"/>' ],
		// refs don't have space before them
		[ editor, /\s<ref/g, '<ref' ]
	];

	for ( const [ field, pattern, replacement ] of steps ) {
		field.replace( pattern, replacement );
	}
};
/**
 * Run every user-supplied function in Cleanup.cleanupFunctions, in order.
 *
 * Each function is called with the body editor plus the header and footer
 * field editors.
 *
 * @param {Object} editor Editor object providing forField()
 */
const do_extra_functions = function ( editor ) {
	const headerField = editor.forField( '#wpHeaderTextbox' );
	const footerField = editor.forField( '#wpFooterTextbox' );
	const invoke = ( userFunction ) => {
		userFunction( editor, headerField, footerField );
	};
	Cleanup.cleanupFunctions.forEach( invoke );
};
/**
 * Replace curly quotes with straight ones.
 *
 * A space just inside a curly quote (after an opening one, before a closing
 * one) is dropped along with the conversion.
 *
 * @param {Object} editor Editor object providing replace()
 */
const do_replaceSmartQuotes = function ( editor ) {
	const quoteSteps = [
		// double quotes: spaced opening, spaced closing, then any left over
		[ /“ /g, '"' ],
		[ / ”/g, '"' ],
		[ /[“”]/g, '"' ],
		// single quotes, same three steps
		[ /‘ /g, "'" ],
		[ / ’/g, "'" ],
		[ /[‘’]/g, "'" ]
	];
	for ( const [ curly, straight ] of quoteSteps ) {
		editor.replace( curly, straight );
	}
};
/**
 * Join hard-wrapped lines into paragraphs.
 *
 * Does nothing if the text contains a <poem> tag. Otherwise:
 *  1. (hack for T230415) if Cleanup.shortLineThreshold > 0, a line shorter
 *     than the threshold that ends in sentence-final punctuation and is
 *     followed by a line *starting* with a quote, capital or digit is
 *     treated as the end of a paragraph, and a blank line is inserted;
 *  2. single line breaks are replaced by a space, except next to tags,
 *     templates, table/list/heading syntax, or another line break;
 *  3. runs of spaces are collapsed;
 *  4. a closing quote followed by a space and an opening quote is split
 *     into two paragraphs.
 *
 * @param {Object} editor Editor object providing get(), set() and replace()
 */
const collapse_line_breaks = function ( editor ) {
	// stuff to do only if the page doesn't contain a <poem> tag:
	if ( editor.get().indexOf( '<poem>' ) !== -1 ) {
		return;
	}

	// first, a hack! [T230415]
	const short_line_thresh = Cleanup.shortLineThreshold;
	if ( short_line_thresh > 0 ) {
		const endsSentence = /[.!?'"”’—]\s*$/;
		// Anchored at the line start: unanchored, any line merely containing
		// a capital, digit or quote anywhere counted as a new paragraph.
		const startsSentence = /^\s*['"“‘A-Z0-9]/;
		const lines = editor.get().split( /\r?\n/ );
		for ( let i = 0; i < lines.length - 1; i++ ) {
			if ( lines[ i ].length < short_line_thresh &&
				endsSentence.test( lines[ i ] ) &&
				startsSentence.test( lines[ i + 1 ] ) ) {
				// the extra newline becomes a blank line once re-joined
				lines[ i ] += '\n';
			}
		}
		editor.set( lines.join( '\n' ) );
	}

	editor
		// remove single line breaks; preserve multiple.
		// not if there's a tag, template, table syntax either side of line break
		// ($2 sits inside the negative lookahead, so it is always empty)
		.replace( /([^>}\n])\n(?!( *\||[{}<]|\n|=|\*|#))/g, '$1 $2' )
		// collapse sequences of spaces into a single space
		.replace( / +/g, ' ' )
		// two quotes are probably two lines
		.replace( /" "/g, '"\n\n"' );
};
/**
 * Merge a paragraph into the previous one when it begins with a lowercase
 * letter, on the theory that such a paragraph break is bogus.
 *
 * Text containing a <poem> tag is left untouched.
 *
 * @param {Object} editor Editor object providing get() and replace()
 */
const collapseSuspiciousParagraphs = function ( editor ) {
	const hasPoem = editor.get().indexOf( '<poem>' ) !== -1;
	if ( hasPoem ) {
		return;
	}
	// a blank-line gap directly before a lowercase letter becomes one space
	const bogusBreak = /\n\n+(?=[a-z])/g;
	editor.replace( bogusBreak, ' ' );
};
/**
 * Wrap loosely-typed abbreviations in the small-caps abbreviation template.
 *
 * For an abbreviation such as 'AD', text like " A. D." or " AD" (each letter
 * optionally followed by "." or "," and a space) that is preceded and
 * followed by whitespace becomes " {{asc|A.D.}}".
 *
 * @param {Object} editor Editor object providing replace()
 * @param {string[]} abbr_list Abbreviations, given as their bare letters
 */
const do_small_abbrs = function ( editor, abbr_list ) {
	const smallAbbrTemplate = 'asc';
	// Abbreviations come from user configuration, so each character is
	// escaped before it goes into the pattern; otherwise "." would match any
	// character and "+" or "(" would make the RegExp constructor throw.
	const escapeForRegex = function ( ch ) {
		return ch.replace( /[.*+?^${}()|[\]\\]/g, '\\$&' );
	};

	for ( const abbr of abbr_list ) {
		let re_str = '';
		let good = '';
		for ( let i = 0; i < abbr.length; i++ ) {
			re_str += escapeForRegex( abbr[ i ] ) + '[.,]? ?';
			good += abbr[ i ] + '.';
		}
		// new word, but not in template
		const re = new RegExp( '(\\s)' + re_str + '(?=\\s)', 'g' );
		editor.replace( re, `$1{{${smallAbbrTemplate}|${good}}}` );
	}
};
/**
 * Click the input inside the "quality3" span of the edit form.
 * NOTE(review): presumably this is the "Proofread" page-status radio button; confirm.
 */
const markProofread = () => {
	// eslint-disable-next-line no-jquery/no-global-selector
	const $statusInput = $( 'span.quality3 input' );
	$statusInput.trigger( 'click' );
};
/**
 * Overwrite the value of the #wpSummary field.
 *
 * @param {string} summary_text New summary text
 */
const set_summary = ( summary_text ) => {
	// eslint-disable-next-line no-jquery/no-global-selector
	const $summaryField = $( '#wpSummary' );
	$summaryField.val( summary_text );
};
/**
 * Mark the page as proofread and, when Cleanup.editSummary is set, replace
 * the edit summary with it.
 */
const do_markProofread = function () {
	// if doing cleanup, must be proofreading
	markProofread();

	const hasSummary = Boolean( Cleanup.editSummary );
	if ( !hasSummary ) {
		return;
	}
	// replaces whatever summary was there before
	set_summary( Cleanup.editSummary );
};
// The main cleanup function
// Editor: the templatescript editor object
//
// Runs every cleanup stage in a fixed order. The order matters: stages that
// need the original line breaks run before collapse_line_breaks(), and the
// word-level fixes run after it. Optional stages are gated on the matching
// Cleanup.* option, read at the moment the stage is reached.
function do_cleanup( editor ) {
// Any cleanups that need the context of the old line breaks
do_pre_collapse_cleanup( editor );
// Do this before line collapses
if ( Cleanup.remove_running_header ) {
process_editor( editor,
new RunningHeaderProcessor( Cleanup.runningHeaderPatterns ) );
}
// Do this first, so we can correct words across collapsed line breaks
collapse_line_breaks( editor );
if ( Cleanup.collapseSuspiciousParagraphs ) {
collapseSuspiciousParagraphs( editor );
}
// Generic cleanup
do_generic_cleanup( editor );
// OCR and scanno fixing
// Do the simple replacements first, as it's easier to write these
// if you don't have to guess what intermediate state the page is in
if ( Cleanup.additionalOcrReplacements.length > 0 ) {
process_editor( editor,
new PartialWordRegexProcessor( Cleanup.additionalOcrReplacements ) );
}
do_ocr_fixes( editor );
do_multiword_fixes( editor );
// Built-in foreign phrases first, then the user's own italic word list
if ( Cleanup.italiciseForeign ) {
do_foreign_italics( editor );
}
if ( Cleanup.italicWords.length > 0 ) {
process_editor( editor, new ItaliciseProcessor( Cleanup.italicWords ) );
}
do_whole_words_reps( editor );
if ( Cleanup.doLongSReplacements ) {
doLongSReplacements( editor );
}
if ( Cleanup.doTemplateCleanup ) {
template_cleanup( editor );
}
if ( Cleanup.replaceSmartQuotes ) {
do_replaceSmartQuotes( editor );
}
// Always called; a no-op when the abbreviation list is empty
do_small_abbrs( editor, Cleanup.smallAbbreviations );
// Any extra functions
do_extra_functions( editor );
// Checked last, so a user function run just above can still change it
if ( Cleanup.markProofread ) {
do_markProofread();
}
}
/**
 * Run do_cleanup(), logging before and after, and log (rather than
 * propagate) anything it throws.
 *
 * @param {Object} editor The editor object passed on to do_cleanup()
 */
function do_cleanup_wrapper( editor ) {
	log( DEBUG, 'Cleaning up...' );

	try {
		do_cleanup( editor );
	} catch ( cleanupError ) {
		// a failed cleanup is reported but must not propagate to the caller
		log( ERROR, cleanupError );
	}

	log( DEBUG, 'Cleanup done.' );
}
/**
 * Find the index of the first position at which two strings differ.
 *
 * @param {string} a
 * @param {string} b
 * @return {number} Index of the first differing character; the length of
 *  the shorter string if one is a strict prefix of the other; -1 if the
 *  strings are identical.
 */
function find_first_diff_pos( a, b ) {
	const commonLength = Math.min( a.length, b.length );

	let pos = 0;
	while ( pos < commonLength && a[ pos ] === b[ pos ] ) {
		pos++;
	}

	if ( pos < commonLength ) {
		// stopped on a real mismatch
		return pos;
	}
	// ran off the end of the shorter string
	return a.length === b.length ? -1 : commonLength;
}
/**
 * Transpose a list of arrays: element i of the result holds the i-th entry
 * of every input array.
 *
 * The result is as long as the longest input, with undefined standing in
 * for entries a shorter array lacks. Previously the length of the first
 * array was used, so extra entries in later arrays were silently dropped.
 * An empty list of arrays yields an empty result instead of throwing.
 *
 * @param {Array[]} arrays
 * @return {Array[]}
 */
function zip( arrays ) {
	const longest = arrays.reduce( function ( max, array ) {
		return Math.max( max, array.length );
	}, 0 );

	const rows = [];
	for ( let i = 0; i < longest; i++ ) {
		rows.push( arrays.map( function ( array ) {
			return array[ i ];
		} ) );
	}
	return rows;
}
// Text of the page as it was before the last test run; null until a test
// has been run. Read by do_cleanup_test_restore().
let test_test_to_restore = null;

/**
 * Self-test: run the cleanup on the current text and compare the result
 * with the content of the "<current page>/expected" subpage.
 *
 * The editor is outlined green on a match and red on a mismatch. Mismatching
 * lines are logged at ERROR level. If the expected page cannot be loaded
 * (module or API failure, or the page has no revisions) that is logged and
 * the editor is outlined red, rather than throwing from the callback or
 * failing silently.
 *
 * @param {Object} editor Editor object providing get(); passed to do_cleanup()
 */
function do_cleanup_test( editor ) {
	const text = editor.get();
	test_test_to_restore = text;
	do_cleanup( editor );
	const cleaned = editor.get();
	const expectedTitle = mw.config.get( 'wgPageName' ) + '/expected';

	const showResult = function ( colour ) {
		// eslint-disable-next-line no-jquery/no-global-selector
		$( '.wikiEditor-ui' ).css( 'outline', '2px solid ' + colour );
	};
	const reportFailure = function ( reason ) {
		log( ERROR, reason );
		showResult( 'red' );
	};

	// Load the "expected" subpage and see if the text matches
	mw.loader.using( 'mediawiki.api' ).done( function () {
		const api = new mw.Api();
		api.get( {
			action: 'query',
			titles: expectedTitle,
			prop: 'revisions',
			rvprop: 'content',
			rvslots: 'main',
			formatversion: 2,
			rvlimit: 1
		} ).done( function ( data ) {
			// a missing page comes back without a "revisions" array
			const pages = data && data.query && data.query.pages;
			const revisions = pages && pages[ 0 ] && pages[ 0 ].revisions;
			if ( !revisions || !revisions[ 0 ] ) {
				reportFailure( `No expected text found at '${expectedTitle}'` );
				return;
			}

			const expected = revisions[ 0 ].slots.main.content;
			if ( expected === cleaned ) {
				showResult( 'green' );
				return;
			}

			log( ERROR, "Expected text doesn't match!" );
			const pairs = zip( [ expected.split( '\n' ), cleaned.split( '\n' ) ] );
			for ( const pr of pairs ) {
				if ( pr[ 0 ] !== pr[ 1 ] ) {
					log( ERROR, 'Line mismatch' );
					log( ERROR, `Expected: '${pr[ 0 ]}', Got: '${pr[ 1 ]}'` );
					if ( pr[ 0 ] && pr[ 1 ] ) {
						// show both lines from the first differing character
						const indx = find_first_diff_pos( pr[ 0 ], pr[ 1 ] );
						log( ERROR, pr[ 0 ].slice( indx ) );
						log( ERROR, pr[ 1 ].slice( indx ) );
					}
				}
			}
			showResult( 'red' );
		} ).fail( function ( code ) {
			reportFailure( `Could not load '${expectedTitle}': ${code}` );
		} );
	} ).fail( function () {
		reportFailure( 'Could not load the mediawiki.api module' );
	} ); // end using
}
/**
 * Undo a test run: put back the text saved by do_cleanup_test() and remove
 * the result outline from the editor.
 *
 * @param {Object} editor Editor object providing set()
 */
function do_cleanup_test_restore( editor ) {
	// Compare against "nothing saved" explicitly: a saved empty string is a
	// legitimate text to restore, and a plain truthiness check skipped it.
	if ( test_test_to_restore !== null && test_test_to_restore !== undefined ) {
		editor.set( test_test_to_restore );
	}
	// eslint-disable-next-line no-jquery/no-global-selector
	$( '.wikiEditor-ui' ).css( 'outline', '' );
}
/**
 * Load Pathoschild's TemplateScript and register the cleanup action with it
 * (plus the test and restore actions when Cleanup.enableTesting is set).
 *
 * A failure to load the library is logged at ERROR level; previously it was
 * silent and the action simply never appeared.
 */
function add_templatescript() {
	const scriptUrl = '//tools-static.wmflabs.org/meta/scripts/pathoschild.templatescript.js';

	const registerEntries = function () {
		const cleanup_entry = {
			name: Cleanup.actionTitle,
			position: 'cursor',
			script: do_cleanup_wrapper,
			enabled: true
		};
		if ( Cleanup.cleanupAccesskey ) {
			cleanup_entry.accessKey = Cleanup.cleanupAccesskey;
		}
		const entries = [
			cleanup_entry
		];
		if ( Cleanup.enableTesting ) {
			entries.push( {
				name: 'Test cleanup',
				script: do_cleanup_test
			} );
			entries.push( {
				name: 'Restore pre-cleanup',
				script: do_cleanup_test_restore
			} );
		}
		// eslint-disable-next-line no-undef
		pathoschild.TemplateScript.add(
			entries, {
				category: Cleanup.portletCategory,
				forNamespaces: Cleanup.activeNamespaces
			} // common fields
		);
	};

	const reportLoadFailure = function ( jqXHR, textStatus, errorThrown ) {
		log( ERROR, 'Could not load TemplateScript: ' + ( errorThrown || textStatus ) );
	};

	$.ajax( scriptUrl, {
		dataType: 'script',
		cache: true
	} ).then( registerEntries, reportLoadFailure );
}
/**
 * Fire the "<signature>.config" hook with the Cleanup object so that hook
 * handlers can adjust it, then install the TemplateScript entries unless
 * Cleanup.enable has been switched off.
 */
function really_run() {
	log( DEBUG, 'Really_run' );

	const configHook = mw.hook( signature + '.config' );
	configHook.fire( Cleanup );

	// read only after the hook has fired, so handlers can disable the gadget
	if ( !Cleanup.enable ) {
		log( DEBUG, 'Cleanup disabled' );
		return;
	}
	add_templatescript();
}
/**
 * Entry point; safe to call more than once. Only the first call does any
 * work, guarded by the Cleanup.started flag.
 */
function run() {
	const alreadyStarted = Boolean( Cleanup.started );
	if ( !alreadyStarted ) {
		// set the flag before starting, as the original ordering does
		Cleanup.started = true;
		really_run();
	}
}
// Start once the 'user' module load and $.ready have settled. always() means
// run() is called whether that combined promise resolves or rejects.
// NOTE(review): presumably 'user' is awaited so that the user's own scripts
// can register their config hook handlers before run() fires the hook; confirm.
$.when( mw.loader.using( 'user' ), $.ready ).always( run );
// eslint-disable-next-line no-undef
}( jQuery, mediaWiki ) );