User:Slaporte/Automated categorization

This is the code I use to automatically suggest categories for United States Supreme Court decisions.

  • 7/14/2010: Added new categories and an example script.

Code

// Maps a search pattern to the category tag suggested when the pattern occurs
// in a decision's text.
//
// Keys are regular-expression fragments without delimiters: categoryGuess()
// wraps each one in delimiters and runs preg_match() with no modifiers, so
// matching is case-sensitive and regex metacharacters must be escaped
// (see "Rule 12\(b\)\(6\)"). Several keys may point at the same category;
// categoryGuess() reports each category only once.
//
// NOTE: keys match anywhere in the text, including inside longer words
// (for example "SEC" also matches "SECTION" and "tax" also matches "syntax").
$catlist = array(
	"tax" => "[[Category:United States Supreme Court decisions on taxation]]",
	"civil.*jurisdiction" => "[[Category:United States Supreme Court decisions on civil procedure]]",
	"Sherman" => "[[Category:United States Supreme Court decisions on antitrust]]",
	"antitrust" => "[[Category:United States Supreme Court decisions on antitrust]]",
	"copyright" => "[[Category:United States Supreme Court decisions on copyright]]",
	"impeach" => "[[Category:United States Supreme Court decisions on evidence]]",
	"class action" => "[[Category:United States Supreme Court decisions on class action]]",
	"ERISA" => "[[Category:United States Supreme Court decisions on ERISA]]",
	//"employee benefit" => "[[Category:United States Supreme Court decisions on ERISA]]",
	"treaty" => "[[Category:United States Supreme Court decisions on treaties]]",
	"constitutional" => "[[Category:United States Supreme Court decisions on constitutionality]]",
	"delegation of .* power" => "[[Category:United States Supreme Court decisions on separation of Powers]]",
	"discrimination" => "[[Category:United States Supreme Court decisions on civil rights]]",
	"§ 1983" => "[[Category:United States Supreme Court decisions on civil rights]]",
	"ethical obligation" => "[[Category:United States Supreme Court decisions on professional responsibility]]",
	"Rule 12\(b\)\(6\)" => "[[Category:United States Supreme Court decisions on civil procedure]]",
	// Fixed: previously pointed at the misspelled category "environmental aw".
	"NEPA" => "[[Category:United States Supreme Court decisions on environmental law]]",
	"environmental" => "[[Category:United States Supreme Court decisions on environmental law]]",
	"law enforcement" => "[[Category:United States Supreme Court decisions on criminal law]]",
	"First Amendment" => "[[Category:United States Supreme Court decisions on the First Amendment]]",
	"Second Amendment" => "[[Category:United States Supreme Court decisions on the Second Amendment]]",
	"Fourth Amendment" => "[[Category:United States Supreme Court decisions on the Fourth Amendment]]",
	"Fifth Amendment" => "[[Category:United States Supreme Court decisions on the Fifth Amendment]]",
	"Eighth Amendment" => "[[Category:United States Supreme Court decisions on the Eighth Amendment]]",
	"[Dd]ue [Pp]rocess" => "[[Category:United States Supreme Court decisions on due process]]",
	"community property" => "[[Category:United States Supreme Court decisions on property]]",
	"disparate-impact" => "[[Category:United States Supreme Court decisions on civil rights]]",
	"freedom of speech" => "[[Category:United States Supreme Court decisions on freedom of speech]]",
	"time, place, and manner" => "[[Category:United States Supreme Court decisions on freedom of speech]]",
	"clear and present danger" => "[[Category:United States Supreme Court decisions on freedom of speech]]",
	"free exercise of religion" => "[[Category:United States Supreme Court decisions on religion]]",
	"Establishment Clause" => "[[Category:United States Supreme Court decisions on religion]]",
	"Sixth Amendment" => "[[Category:United States Supreme Court decisions on the Sixth Amendment]]",
	"Commerce Clause" => "[[Category:United States Supreme Court decisions on the Commerce Clause]]",
	"justiciable" => "[[Category:United States Supreme Court decisions on justiciability]]",
	"justiciability" => "[[Category:United States Supreme Court decisions on justiciability]]",
	"abortion" => "[[Category:United States Supreme Court decisions on abortion]]",
	"SEC" => "[[Category:United States Supreme Court decisions on securities]]",
	"arbitrary and capricious" => "[[Category:United States Supreme Court decisions on statutory interpretation]]",
	"complete diversity" => "[[Category:United States Supreme Court decisions on civil procedure]]",
);

/**
 * Suggests category tags for a text by testing it against a list of patterns.
 *
 * @param string $txt  Text to inspect.
 * @param array  $list Map of regular-expression fragment (without delimiters)
 *                     to the category tag suggested when the fragment matches.
 * @return array The distinct categories whose pattern matched, in list order,
 *               followed by "[[Category:Automated categorization]]"; or only
 *               the "Uncategorized" tag when nothing matched.
 */
function categoryGuess($txt, $list){
	$categories = array();
	foreach($list as $key => $cat) {
		// Purely numeric keys arrive as integers; treat every key as text.
		$key = (string)$key;

		// An empty fragment would compile to "//", which matches every text,
		// and an entry without a category has nothing to suggest.
		if($key === "" || $cat === null || $cat === ""){
			continue;
		}

		// Pick a delimiter the fragment does not contain, so a "/" inside a
		// key cannot terminate the pattern early. "/" stays the fallback.
		$delimiter = "/";
		foreach(array("/", "~", "#", "%", "!") as $candidate){
			if(strpos($key, $candidate) === false){
				$delimiter = $candidate;
				break;
			}
		}

		// preg_match() returns false for an invalid pattern; only a real match counts.
		if(preg_match($delimiter.$key.$delimiter, $txt) === 1
			&& !in_array($cat, $categories, true)) {
			$categories[] = $cat;
		}
	}

	if(count($categories) === 0){
		$categories[] = "[[Category:Uncategorized United States Supreme Court decision]]";
	} else {
		// Marks the page as categorized by this tool.
		$categories[] = "[[Category:Automated categorization]]";
	}

	return $categories;
}

Example script

<?php
/**
 * Fetches a URL with cURL and returns the response body.
 *
 * @param string $url Address to request.
 * @return string|bool The response body, or false when the transfer fails
 *                     (the result of curl_exec() is passed through unchanged).
 */
function getRawText($url){
	$ch = curl_init();
	$timeout = 5; // seconds allowed for establishing the connection
	curl_setopt($ch,CURLOPT_URL,$url);
	// Return the body from curl_exec() instead of printing it.
	curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
	curl_setopt($ch,CURLOPT_CONNECTTIMEOUT,$timeout);
	// CURLOPT_USERAGENT takes only the header value; cURL adds the
	// "User-Agent:" name itself, so the value must not repeat it.
	curl_setopt($ch, CURLOPT_USERAGENT, 'guessingcategory/1 [[User:Slaporte]]');
	$data = curl_exec($ch);
	curl_close($ch);

	return $data;
}
/**
 * Suggests category tags for a text by testing it against a list of patterns.
 *
 * @param string $txt  Text to inspect.
 * @param array  $list Map of regular-expression fragment (without delimiters)
 *                     to the category tag suggested when the fragment matches.
 * @return array The distinct categories whose pattern matched, in list order,
 *               followed by "[[Category:Automated categorization]]"; or only
 *               the "Uncategorized" tag when nothing matched.
 */
function categoryGuess($txt, $list){

	$categories = array();

	foreach($list as $key => $cat) {
		// Purely numeric keys arrive as integers; treat every key as text.
		$key = (string)$key;

		// An empty fragment would compile to "//", which matches every text,
		// and an entry without a category has nothing to suggest.
		if($key === "" || $cat === null || $cat === ""){
			continue;
		}

		// Pick a delimiter the fragment does not contain, so a "/" inside a
		// key cannot terminate the pattern early. "/" stays the fallback.
		$delimiter = "/";
		foreach(array("/", "~", "#", "%", "!") as $candidate){
			if(strpos($key, $candidate) === false){
				$delimiter = $candidate;
				break;
			}
		}

		// preg_match() returns false for an invalid pattern; only a real match counts.
		if(preg_match($delimiter.$key.$delimiter, $txt) === 1
			&& !in_array($cat, $categories, true)) {
			$categories[] = $cat;
		}
	}

	if(count($categories) === 0){
		$categories[] = "[[Category:Uncategorized United States Supreme Court decision]]";
	} else {
		// Marks the page as categorized by this tool.
		$categories[] = "[[Category:Automated categorization]]";
	}

	return $categories;
}
/**
 * Downloads the pattern list from the wiki and parses it into a map.
 *
 * The <nowiki>/<pre> wrapper tags are stripped, then each line is split at
 * "=>" into a pattern and a category. Lines without "=>" (blank lines,
 * leftovers) and lines with an empty pattern or category are ignored, so
 * they can never become a pattern that matches every text.
 *
 * @return array Map of trimmed pattern => trimmed category; empty when the
 *               page could not be fetched or holds no usable line.
 */
function getList(){
	$txt = getRawText("http://en.wikisource.org/w/index.php?action=raw&title=User:Slaporte/Automated_categorization/list");
	// getRawText() yields false on a failed transfer; treat that as an empty page.
	$txt = (string)$txt;
	$txt = str_replace("<nowiki>\n","",$txt);
	$txt = str_replace("<pre>\n","",$txt);
	$txt = str_replace("\n</nowiki>","",$txt);
	$txt = str_replace("\n</pre>","",$txt);

	$cats = array();
	foreach(explode("\n",$txt) as $item) {
		// Split at the first "=>" only, so the category text is kept whole.
		$pair = explode("=>",$item,2);
		if(count($pair) < 2){
			continue;
		}
		$pattern = trim($pair[0]);
		$category = trim($pair[1]);
		if($pattern === "" || $category === ""){
			continue;
		}
		$cats[$pattern] = $category;
	}
	return $cats;
}
/**
 * Prints every entry of the given list on its own line.
 *
 * @param array $cats Values to print; each one is followed by a newline.
 * @return void
 */
function displayCats($cats){
	$output = "";
	foreach($cats as $entry){
		$output .= $entry . "\n";
	}
	echo $output;
}
// Remember the submitted title so the form below can show it again.
// Only a plain string is accepted: a request such as "PageName[]=x" makes
// PHP deliver an array, which cannot be printed back into the text field.
if(isset($_POST["PageName"]) && is_string($_POST["PageName"])){
	$page = $_POST["PageName"];
}
?>
<html>
<head><title>CategoryGuesser</title>
</head>
<body>
<div id=main>
<h1>Court Case Category Suggestion Tool</h1>
<form method="post" action="categoryguess.php">
<label for="PageName">Page title:</label>
<input type="text" id="PageName" name="PageName" value="<?php /* The title is raw user input placed inside a quoted attribute, so it must be HTML-escaped. */ if(isset($page) && is_string($page)){print htmlspecialchars($page, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');} ?>">
<div class="subtitle"><p>Enter the title of the page on wikisource, such as <i>International Shoe v. State of Washington</i></p></div>
<button type="submit" value="Submit" id="find">Submit</button>
<br />
<div id='results'>
<?php
// Runs only after the form was submitted with a textual page title.
if(isset($_POST["PageName"]) && is_string($_POST["PageName"])){
	$page = $_POST["PageName"];
	// Wiki page titles use underscores in place of spaces.
	$page = str_replace(" ","_",$page);
	// Percent-encode the title so characters such as "&", "#" or "?" stay
	// part of the title parameter instead of altering the request.
	$url = "http://en.wikisource.org/w/index.php?action=raw&title=".rawurlencode($page);
	print "<br/><label for='cats'>Suggested Categories:</label><br/><br/>";
	print "<textarea cols=70 rows=30 id='cats' name='cats'>";
	// The category list lives on a publicly editable wiki page, so its text
	// is escaped before it is written into the HTML document.
	$suggestions = categoryGuess(getRawText($url),getList());
	foreach($suggestions as $index => $suggestion){
		$suggestions[$index] = htmlspecialchars((string)$suggestion, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
	}
	displayCats($suggestions);
	print "</textarea>";
}
?>
</div>
</form>
</div>
<p><a href="http://en.wikisource.org/wiki/User:Slaporte/Automated_categorization/list">Add or edit</a> category suggestions (live!). <a href="http://en.wikisource.org/wiki/User:Slaporte/Automated_categorization">source code and documentation</a> available.</p><p>leave <a href="http://en.wikisource.org/wiki/User_talk:Slaporte/Automated_categorization">feedback</a>.</p>
</body>
</html>