#!/usr/bin/perl -w
use utf8;
$lws = 5; # length of word sequence
$lang = "de"; # en, de
$mkImg = 1; # 1: make history image, 0: don't
$histDir = "histcache"; # where to save the revision histories
$wiki = "de.wikipedia.org"; # can be changed with e.g. "-w de.wikibooks.org"
$imgUrlDir = "http://localhost/hauptautoren";
if($^O eq "linux") {
$imgFileDir = "/var/www/html/hauptautoren";
$netpbmPath = ""; # pnmtopng etc. should be in the system path
} else { # Windows
$imgFileDir = "c:\\Programme\\Apache Software Foundation\\Apache2.2\\html\\hauptautoren";
$netpbmPath = "c:\\Programme\\GnuWin32\\bin";
}
########## end of configuration #################################
use Digest::MD5 qw(md5_hex);
use Compress::Zlib;
use Encode;
use CGI qw(:standard);
use LWP;
$ua = LWP::UserAgent->new;
use open ":utf8"; binmode STDOUT, ":utf8";
use Cwd;
use File::Spec::Functions;
use Getopt::Std;
getopts('w:');
if(defined $opt_w) {
$wiki = $opt_w;
}
if ($lang eq "en") {
$category = "Category";
$image = "Image";
$words = "words";
$fraction = "fraction";
$user = "user";
} elsif ($lang eq "de") {
$category = "Kategorie";
$image = "Bild";
$words = "Wörter";
$fraction = "Anteil";
$user = "Benutzer";
} else {
die;
}
$| = 1;
$stop = 0;
$SIG{INT} = sub { $stop = 1 };
$SIG{PIPE} = sub { $stop = 1 };
$sNr = 0; $nWordsMax = 0;
import_names;
if(defined $ARGV) {
$title = $ARGV;
$title =~ s/ /_/g;
$title_md5 = md5_hex "$title";
$cgi = 0;
$imgFileDir = cwd;
$imgFile = catfile($imgFileDir, "$title_md5.png");
$imgUrl = "file:$imgFile";
} elsif(defined $Q::page) {
$title = $Q::page;
$title =~ s/ /_/g;
$title_md5 = md5_hex "$title";
$cgi = 1;
print header(-charset => 'utf-8');
print "<!--\n";
mkdir $imgFileDir if $mkImg && !-d $imgFileDir;
$imgFile = catfile($imgFileDir, "hf.png");
$imgUrl = "$imgUrlDir/hf.png";
} else {
die;
}
$allowedTags = 'b|big|blockquote|br|caption|center|cite|code|dd|del|div|dl|'.
'dt|em|font|h1|h2|h3|h4|h5|h6|hr|i|ins|li|nowiki|ol|p|pre|ref|'.
'references|rb|rp|rt|ruby|s|small|span|strike|strong|sub|sup|table|'.
'td|th|tr|tt|u|ul|var';
sub progress {
print $_;
}
$subdir = substr $title_md5, 0, 2;
$idLast = -1;
if(-f catfile($histDir, $wiki, $subdir, "$title_md5.seq")) {
progress("Loading cached sequences ... ");
open SEQ, catfile($histDir, $wiki, $subdir, "$title_md5.seq");
while(<SEQ>) {
chop;
if(/^# (\d+)$/) {
$idLast = $1;
} else {
@wordsIds = split;
$seq = join(" ", @wordsIds);
$id{$seq} = ) ];
}
}
close SEQ;
progress("done.\n");
}
if($mkImg && -f catfile($histDir, $wiki, $subdir, "$title_md5.idh")) {
progress("Loading cached author attribution info ... ");
open IDH, catfile($histDir, $wiki, $subdir, "$title_md5.idh");
binmode IDH;
while(!eof(IDH)) {
read IDH, $tmp, 4;
$id = unpack('V', $tmp);
read IDH, $tmp, 4;
$nIdh = unpack('V', $tmp);
push @nWords, $nIdh;
$nWordsMax = $nIdh if $nIdh>$nWordsMax;
read IDH, $tmp, 4;
$idhBinGzLen = unpack('V', $tmp);
read IDH, $idhBinGz, $idhBinGzLen;
($gz, $status) = inflateInit(-WindowBits => 0 - MAX_WBITS);
($idhBin, $status) = $gz->inflate($idhBinGz);
$status==Z_STREAM_END or die $gz->msg();
@idh = unpack("V", $idhBin);
push @idHist, ;
}
close IDH;
progress("done.\n");
if(int(@idHist) != $idLast+1) {
progress("Cache corrupted, must reanalyze.\n");
unlink catfile($histDir, $wiki, $subdir, "$title_md5.seq");
unlink catfile($histDir, $wiki, $subdir, "$title_md5.idh");
%id = (); $idLast = -1;
@nWords = (); @idHist = ();
}
}
open RI, "perl loadhistory -w $wiki '$title' |";
$id=0;
progress("Analyzing history ...\n");
#open DBG, ">md5.txt";
while($revInfo = <RI>) {
if($revInfo =~ /^# (.*)/) {
$msg = $1;
progress "\t$msg\n";
if($msg =~ /^Error/) {
close RI;
exit;
}
next;
}
$revInfo0 = $revInfo;
$revInfo0 =~ /user="(.*?)"/;
if($id % 20 == 0) {
$revInfo0 =~ /timestamp="(....)-(..)-(..)T(..):(..):(..)Z"/;
progress("$1$2$3$4$5$6\n");
}
$author = $1;
if($id<=$idLast && (!$mkImg || defined $idHist)) {
$id++;
next;
} else {
analyzeText();
$id++;
}
last if $stop;
}
#close DBG;
$id--;
analyzeText() if !defined $text0;
close RI;
if($stop) {
progress("interrupted.\n");
exit;
} else {
progress("done.\n");
}
sub analyzeText {
$revInfo0 =~ /pos=(\d+) len=(\d+)/;
$pos = $1; $len = $2;
open TXT, catfile($histDir, $wiki, $subdir, "$title_md5.txt");
binmode TXT;
seek TXT, $pos, 0;
read TXT, $textGz, $len;
($gz, $status) = inflateInit(-WindowBits => 0 - MAX_WBITS);
($text, $status) = $gz->inflate($textGz);
$status==Z_STREAM_END or die $gz->msg();
close TXT;
# print DBG md5_hex("$text"), "\n";
$text = Encode::decode_utf8($text);
# convert <, >, & and remove (inter-)wikilinks
$text =~ s/</</sg; $text =~ s/>/>/sg; $text =~ s/&/&/sg;
$text =~ s/\\]//sg;
$text =~ s/\\|]*?\|)?(+?)\]\]/$2/sg;
$text =~ s/\n{3,}/\n\n/sg;
$text0 = $text;
# remove elements not to be colored
while($text =~ s/\{\{((?!\{\{).)*?\}\}/ /sg) {}
while($text =~ s/\{\|((?!\{\|).)*?\|\}/ /sg) {}
$text =~ s/\\]/ /isg;
$text =~ s/\*? ?\/ /isg;
$text =~ s/\*? ?(http|ftp|mailto):\S*/ /isg;
$text =~ s/<math>.*?<\/math>/ /sg;
$text =~ s/<\/?($allowedTags)(\s+.{1,200}?)?\/?>/ /sg;
$text =~ s/&(\w+|#(\d+|x+));/ /sg;
@words = ();
while ($text =~ /+/sg) {
push @words, $&;
}
$nWords = @words;
$nWordsMax = @words>$nWordsMax?@words:$nWordsMax;
@id = ();
for ($i=0; $i<@words; $i++) {
$id = $id;
}
for ($i=0; $i<@words-$lws+1; $i++) {
$seq = join(" ", @words);
if (defined $id{$seq}) {
for ($j=$i; $j<$i+$lws; $j++) {
if ($id>$id{$seq}) {
$id = $id{$seq};
}
}
}
}
my %idNew = ();
for ($i=0; $i<@words-$lws+1; $i++) {
$seq = join(" ", @words);
if (!defined $id{$seq}) {
for ($j=$i; $j<$i+$lws; $j++) {
$id{$seq} = $id;
$idNew{$seq} = $id;
}
}
}
open SEQ, ">>" . catfile($histDir, $wiki, $subdir, "$title_md5.seq");
print SEQ "# $id\n";
foreach $seq (keys %idNew) {
print SEQ $seq, " ", join(",", @{$idNew{$seq}}), "\n";
}
close SEQ;
if($mkImg && !defined $idHist) {
$idHist = ;
open IDH, ">>".catfile($histDir, $wiki, $subdir, "$title_md5.idh");
binmode IDH;
#print int(@id), "\n";
print IDH pack('V', $id);
print IDH pack('V', int(@id));
$idhBin = pack('V', @id);
($gz, $status) = deflateInit(-WindowBits => 0 - MAX_WBITS);
($idhBinGz1, $status) = $gz->deflate($idhBin);
($idhBinGz2, $status) = $gz->flush();
$idhBinGz = $idhBinGz1 . $idhBinGz2;
print IDH pack('V', do { use bytes; length $idhBinGz });
print IDH $idhBinGz;
close IDH;
}
}
for ($i=0; $i<@words; $i++) {
$words{$author]}++;
}
@authors = sort {$words{$b} <=> $words{$a}} keys %words;
for ($i=0; $i<@authors; $i++) {
if ($i>5) {
$color{$authors} = "#000000";
$colorImg{$authors} = "\x00\x00\x00" if $mkImg;
} else {
$color{$authors} = ("#bf0000", "#00bf00", "#0000bf", "#007f7f",
"#7f007f", "#7f7f00");
$colorImg{$authors} = ("\xbf\x00\x00", "\x00\xbf\x00", "\x00\x00\xbf", "\x00\x7f\x7f",
"\x7f\x00\x7f", "\x7f\x7f\x00") if $mkImg;
}
}
# compute history image
if ($mkImg) {
progress("Computing image ...\n");
open IMG, ">hf_tmp.ppm";
binmode IMG;
printf IMG "P6 %d %d 255\n", $id+1, $nWordsMax;
for ($y=0; $y<$nWordsMax && !$stop; $y++) {
print "$y/$nWordsMax\n" if $y%100==0;
for ($x=0; $x<=$id; $x++) {
if ($y<$nWords) {
if (!defined $colorImg{$author]}) {
print IMG "\x00\x00\x00";
} else {
print IMG $colorImg{$author]};
}
} else {
print IMG "\xff\xff\xff";
}
}
}
close IMG;
if(!$stop) {
if(defined $netpbmPath && $netpbmPath ne "") {
$pamscale = catfile($netpbmPath, "pamscale");
$pnmtopng = catfile($netpbmPath, "pnmtopng");
} else {
$pamscale = "pamscale";
$pnmtopng = "pnmtopng";
}
system "$pamscale -width 400 -height 400 hf_tmp.ppm > hf_tmp2.ppm";
system "$pnmtopng hf_tmp2.ppm > $imgFile";
}
unlink "hf_tmp.ppm", "hf_tmp2.ppm";
exit if $stop;
progress("done.\n");
}
# mask elements not to be colored
sub subst {
my $s = "___".$sNr++."___";
$substBlock{$s} = $_;
$s;
}
while($text0 =~ s/\{\{((?!\{\{).)*?\}\}/subst($&)/esg) {}
while($text0 =~ s/\{\|((?!\{\|).)*?\|\}/subst($&)/esg) {}
$text0 =~ s/\\]/subst($&)/iesg;
$text0 =~ s/\*? ?\/subst($&)/iesg;
$text0 =~ s/\*? ?(http|ftp|mailto):\S*/subst($&)/iesg;
$text0 =~ s/<math>.*?<\/math>/subst($&)/esg;
$text0 =~ s/<\/?($allowedTags)(\s+.{1,200}?)?\/?>/subst($&)/esg;
$text0 =~ s/&(\w+|#(\d+|x+));/subst($&)/esg;
# color the text
for ($i=0; $i<@words; $i++) {
$text0 =~ s/^(.*?)$words//sg;
$gap = $1;
if ($i==0) {
$coloredText = "$gap<font color=\"$color{$author]}\">$words"
} else {
if ($gap =~ /^\s+$/ && $author] eq $author]) {
$coloredText .= "$gap$words"
} else {
$coloredText .= "</font>$gap<font color=\"$color{$author]}\">$words"
}
}
}
$coloredText .= "</font>$text0";
# fetch back masked elements
while ($coloredText =~ s/___\d+___/$substBlock{$&}/sg) {}
$stats = "";
if($mkImg) {
$stats .= "<table cellspacing=\"10\">\n";
$stats .= "<tr valign=\"top\"><td>\n";
}
$stats .= "<table cellspacing=\"0\" border=\"1\">\n";
$stats .= "<tr><th>$words</th><th>$fraction</th><th>$user</th></tr>\n";
for($i=0; $i<@authors; $i++) {
$author = $authors;
$stats .= sprintf "<tr><td>%5i</td><td>%4.1f%%</td><td><a href=\"http://$wikihttps://wiki95.com/de/$user:%s\" style=\"color:%s\">%s</a></td></tr>\n",
$words{$author}, 100*$words{$author}/@words, $author, $color{$author}, $author;
if($i<6) {
$statsShort .= sprintf "<a href=\"http://$wikihttps://wiki95.com/de/$user:%s\" style=\"color:%s\">%s</a> (%d)%s\n",
$author, $color{$author}, $author, $words{$author}, $i<5?";":"";
}
}
$stats .= "</table>\n";
if ($mkImg) {
$stats .= "</td><td><img src=\"$imgUrl\"></td></tr>\n";
$stats .= "</table>\n";
}
#open DBG, ">hf-debug.txt";
#print DBG $coloredText;
#close DBG;
exit if $stop;
progress("Sending preview request to $wiki ... ");
$url = "http://$wiki/w/index.php?title=$title&action=submit";
$response = $ua->post( $url, [
wpTextbox1 => Encode::encode_utf8($coloredText),
wpPreview => "Vorschau zeigen",
]);
$html = $response->decoded_content;
if($html =~ /Quelltext betrachten/) {
progress("page is protected.\n");
exit if $stop;
progress("Sending another preview request to $wiki ... ");
$url = "http://$wiki/w/index.php?title=${title}_tmp&action=submit";
$response = $ua->post( $url, [
wpTextbox1 => Encode::encode_utf8($coloredText),
wpPreview => "Vorschau zeigen",
]);
$html = $response->decoded_content;
}
progress("done.\n");
exit if $stop;
$html =~ s/<head>/<head><base href="http:\/\/$wiki" \/>/s;
$html =~ s/<title>Bearbeiten von (.*?) - Vorschau - Wikipedia<\/title>/<title>$1 - Wikipedia<\/title>/s;
$html =~ s/<div class='previewnote'>.*?<\/div>//s;
$html =~ s/(<div id="wikiPreview">)<h2>.*?<\/h2>/$1/s;
$html =~ s/<h1 class="firstHeading">Bearbeiten von (.*?)<\/h1>/<h1 class="firstHeading">$1<\/h1>$statsShort<hr>/s;
$html =~ s/<p>Diese Seite ist \d+ kB groß\..*?<\/p>//s;
$html =~ s/<form id="editform".*?<\/form>/<p \/><hr \/><hr \/><p>$stats<\/p>/s;
if($cgi) {
print "-->\n";
print $html;
} else {
open OUT, ">$title.html";
print OUT $html;
close OUT;
}