User:Eloquence/Wikinfo import script
|
Copy this from the source (http://en.wikipedia.org/w/wiki.phtml?title=User:Eloquence/Wikinfo_import_script&action=edit), not from here.
This is the first pre-release of the new import script. It runs as a local web server on port 8450 and does some cool stuff:
- Filters Special:Newpages for new articles that are not "from Wikipedia"
- Does not require a cookies.txt - uses its own account. Works with Wikinfo's new login requirement
- Auto-generates diffs of new articles that exist in both Wikinfo and Wikipedia
- Strips signatures from titles when importing
Note that to do all this it has to load quite a few pages, which takes some time for a full set of 500 new pages.
To do:
- Cache previous runs in wikinfo.db so we don't have to wait 60 seconds
- Use Special:Export
If you want to use this in some way or another, make sure you install all the used modules first using CPAN.
#!/usr/bin/perl
#
# Wikinfo import script.
#
# Runs a small HTTP server on localhost:8450.
#   GET /          lists the pages from Wikinfo's Special:Newpages whose
#                  edit comment does not mark them as "from Wikipedia",
#                  together with a diff against the Wikipedia article of
#                  the same name (if one exists) or an import link.
#   GET /<title>   copies the Wikinfo article <title> to Wikipedia,
#                  provided Wikipedia has no article of that name yet.
# Any other request method is answered with 403 Forbidden.

use strict;
use warnings;

use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Daemon;
use HTTP::Status;
use HTTP::Response;
use URI::Escape;
use Text::ParagraphDiff;
use GDBM_File;

# Persistent storage. Nothing in this script reads or writes it yet, so a
# failure to open the file is reported but is not fatal.
my %storage;
tie %storage, 'GDBM_File', "wikinfo.db", GDBM_WRCREAT, 0640
    or warn "Could not open wikinfo.db: $!\n";

$SIG{INT} = \&catch_zap;    # best strategy

# Not referenced anywhere in this script yet; presumably key prefixes for
# entries in %storage -- TODO confirm once caching is implemented.
my $WKPREFIX  = "WIKINFO_";
my $WKSPREFIX = "WIKINFOSIZE_";
my $LCPREFIX  = "LASTCHECK_";
my $WPPREFIX  = "WIKIPEDIA_";
my $DIPREFIX  = "DIFF_";

# One user agent with an in-memory cookie jar, shared by every request so
# that the login cookies obtained below are sent with later requests.
my $browser = LWP::UserAgent->new();
$browser->cookie_jar( {} );

# Browser-like headers sent with every outgoing request.
my @ns_headers = (
    'User-Agent'      => 'Mozilla/4.76 [en] (Win98; U)',
    'Accept'          => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);

# Log in to both wikis.
$browser->post(
    "http://www.wikinfo.org/wiki.phtml?title=Special:Userlogin&action=submit",
    @ns_headers,
    Content => [
        wpName         => "Testuser",
        wpPassword     => "testpass",
        wpRemember     => "1",
        wpLoginAttempt => "LOG IN",
    ],
);
$browser->post(
    "http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogin&action=submit",
    @ns_headers,
    Content => [
        wpName         => "Testuser",
        wpPassword     => "testpass",
        wpRemember     => "1",
        wpLoginAttempt => "LOG IN",
    ],
);

my $daemon = HTTP::Daemon->new(
    LocalHost => 'localhost',
    LocalPort => '8450',
    Reuse     => 1,
) or die "Could not start the HTTP daemon on localhost:8450: " . ( $@ || $! ) . "\n";
print "Please contact me at: " . $daemon->url . "\n";

# Serve one request per accepted connection.
while ( my $conn = $daemon->accept ) {
    if ( my $request = $conn->get_request ) {
        if ( $request->method eq 'GET' ) {
            my $path = $request->uri->path;
            my $html = page_header();
            if ( $path eq "/" ) {
                $html .= render_index();
            }
            else {
                # Everything after the leading slash is the (URL-escaped)
                # Wikinfo page title to import.
                $html .= import_wikinfo( substr( $path, 1 ) );
                $html .= "</body></html>";
            }
            my $reply = HTTP::Response->new(RC_OK);
            $reply->header( "Content-Type" => "text/html" );
            $reply->content($html);
            $conn->send_response($reply);
        }
        else {
            $conn->send_error(RC_FORBIDDEN);
        }
    }
    $conn->close;
    undef $conn;    # close connection
}

# Returns the HTML prologue (style sheet and heading) shared by all pages.
sub page_header {
    return <<'HTML';
<html>
<head>
<style TYPE="text/css">
<!--
body { margin-left:2em;margin-right:2em;background:#eeeeee;}
a { text-decoration:none;color:blue;}
a.ext { color:green;cursor:help; }
-->
</STYLE>
<body>
<h1>Wikinfo Import Script</h1>
HTML
}

# Returns the HTML table for the index page, built from the flat list of
# eight-field records produced by get_wikinfo_new().
sub render_index {
    my $html = <<'HTML';
<table border="1" width="100%">
<tr><td><b>Wikinfo page</B></td><td><b>Corresponding Wikipedia page</B></td><td><b>Import</B></td></tr>
HTML

    my @fields = get_wikinfo_new();
    while ( my ( $linkopen, $linktitle, $linkclose, $bytes,
                 $comment,  $wikipedia, $diff,      $import )
            = splice( @fields, 0, 8 ) )
    {
        $html .= "<tr valign='top'><td>" . $linkopen . $linktitle . $linkclose
               . " (" . $bytes . " bytes)";
        if ($comment) {
            $html .= "<br><I>$comment</I>";
        }
        $html .= "</td><td>$wikipedia</td><td>$import</td></tr>";
        if ( $diff ne "N/A" ) {
            $html .= "<tr><td colspan=3 bgcolor=\"#dddddd\"><b>Diff:</B><P><font size=-1>"
                   . "$diff</font></td></tr>";
        }
    }

    $html .= <<'HTML';
</table>
</body>
</html>
HTML
    return $html;
}

# Fetches an "action=edit" page and returns the raw (still HTML-escaped)
# content of its <textarea>, or "" when the page contains no textarea.
# The capture is taken in list context so that a failed match can never
# hand back a stale $1 from an earlier, unrelated match.
sub fetch_edit_source {
    my $url      = shift;
    my $response = $browser->get( $url, @ns_headers );
    my ($source) = $response->content =~ m/<textarea.*?>(.*)<\/textarea>/is;
    return defined $source ? $source : "";
}

# Scans Wikinfo's Special:Newpages and returns a flat list with eight
# consecutive fields per reported page:
#   link open tag, link title, link close tag, size in bytes, edit comment,
#   Wikipedia cell, diff (or "N/A"), import cell (or "N/A").
# Pages whose comment says "from Wikipedia (note changes here)" are skipped.
# Returns an empty list when the new-pages list cannot be found.
sub get_wikinfo_new {
    my $response = $browser->get(
        "http://www.wikinfo.org/wiki.phtml?title=Special:Newpages&limit=500&offset=0",
        @ns_headers );

    my ($list) = $response->content =~ m/<ol start=.*?>(.*?)<\/ol>/s
        or return;
    my @entries = split( /<LI>/i, $list );
    print "Checking $#entries new page entries\n";

    my @checklines;
    for my $entry (@entries) {
        next
            unless $entry =~ m/(.*?)(<a href.*?>)(.*?)(<\/a>).*?\((.*?) bytes\)/i;
        my ( $linkopen, $linktitle, $linkclose, $bytes ) = ( $2, $3, $4, $5 );

        my $comment = $entry =~ m/<em>\((.*)\)<\/em>/i ? $1 : "";
        next if $comment =~ m/from wikipedia \(note changes here\)/i;

        my $wikititle  = to_url($linktitle);
        my $pediatitle = to_url( strip_sig($linktitle) );

        push @checklines, $linkopen, $linktitle, $linkclose, $bytes, $comment;

        my $pedia_edit_url = "http://en.wikipedia.org/w/wiki.phtml?title="
                           . $pediatitle . "&action=edit";
        my $pediasource = fetch_edit_source($pedia_edit_url);

        if ( $pediasource =~ m/\w+/ ) {
            # Wikipedia already has text under this title: show a diff
            # instead of offering an import.
            my $wikinfosource = fetch_edit_source(
                "http://www.wikinfo.org/wiki.phtml?title=" . $wikititle
                . "&action=edit" );
            my $diff = text_diff( $pediasource, $wikinfosource,
                { string => 1, plain => 1, escape => 1 } );
            # Keep only what lies between the first <p> and the last </p>;
            # if there is no such wrapper, keep the diff output as it is.
            if ( $diff =~ m/<p>(.*)<\/p>/si ) {
                $diff = $1;
            }
            $diff =~ s/ size="\+1">/>/gi;

            push @checklines, "<a href='$pedia_edit_url'>$pediatitle</a>";
            push @checklines, $diff;
            push @checklines, "N/A";    # exists, no import possible
        }
        else {
            push @checklines,
                "<a href='http://en.wikipedia.org/wiki/$pediatitle'>N/A</A>";    # no Wikipedia URL
            push @checklines, "N/A";                                             # no diff
            my $importurl = $daemon->url . $wikititle;
            push @checklines, "<a href='$importurl'>Go!</a>";
        }
    }
    return @checklines;
}

# Copies one Wikinfo article to Wikipedia.
#   $title - URL-escaped Wikinfo page title (as it appears in the request
#            path, without the leading slash).
# Returns an HTML fragment describing what happened: page not found on
# Wikinfo, article posted, or article already present on Wikipedia.
sub import_wikinfo {
    my $title   = shift;
    my $editurl = "http://www.wikinfo.org/wiki.phtml?title=" . $title . "&action=edit";
    my $viewurl = "http://www.wikinfo.org/wiki.phtml?title=" . $title;

    my $pagetitle = to_wiki($title);
    my $pediaurl  = to_url( strip_sig($pagetitle) );

    my $source = fetch_edit_source($editurl);

    # Undo the HTML escaping of the textarea content. "&amp;" must be
    # decoded last, otherwise text such as "&amp;lt;" would be decoded twice.
    $source =~ s/&quot;/"/gi;
    $source =~ s/&gt;/>/gi;
    $source =~ s/&lt;/</gi;
    $source =~ s/&amp;/&/gi;

    if ( $source !~ m/\w+/ ) {
        return "The page with the specified title was not found: <A HREF='$viewurl'>$viewurl</A> (<a href='$editurl'>edit</a>)";
    }
    $source .= "\n\n''Adapted from the [[Wikinfo]] article [$viewurl $pagetitle], licensed under the [[GNU Free Documentation License]].''";

    my $rv = "Checking for duplicate of <A HREF='$viewurl'>$viewurl</A>..<P>";

    my $tryurl   = "http://en.wikipedia.org/wiki/" . $pediaurl;
    my $response = $browser->get( $tryurl, @ns_headers );
    if ( $response->content =~ m/There is currently no text in this page/ ) {
        $rv .= "Posted new article to <a href='$tryurl'>$tryurl</A>!<P>";
        my $wpurl = "http://en.wikipedia.org/w/wiki.phtml?title=" . $pediaurl
                  . "&action=submit";
        $browser->post(
            $wpurl,
            @ns_headers,
            Content => [
                wpTextbox1 => $source,
                wpSave     => "Save page",
                wpSummary  => "Imported from Wikinfo via [[User:Eloquence/Wikinfo import script]]",
            ],
        );
    }
    else {
        $rv .= "Page already exists on Wikipedia: <a href='$tryurl'>$tryurl</A>! You have to merge by hand. :-(";
    }
    return $rv;
}

# SIGINT handler: releases the tied storage, then terminates the program.
sub catch_zap {
    my $signame = shift;
    untie %storage;
    die "Program terminated: Received $signame";
}

# Removes a trailing " by <author>" signature from a page title for the
# authors listed below and returns the resulting title.
sub strip_sig {
    my $title = shift;
    for my $name ( "Levan Urushadze", "Fred Bauder" ) {
        # \Q...\E keeps the name literal should it ever contain regex
        # metacharacters.
        $title =~ s/ by \Q$name\E$//;
    }
    return $title;
}

# Converts a page title to its URL form: spaces become underscores, the
# result is URI-escaped, and apostrophes are encoded as %27.
sub to_url {
    my $title = shift;
    $title =~ tr/ /_/;
    $title = uri_escape($title);
    $title =~ s/'/%27/g;
    return $title;
}

# Converts a URL-form title back to a readable title: URI-unescapes it,
# decodes any remaining %27 to an apostrophe, and turns underscores into
# spaces.
sub to_wiki {
    my $title = shift;
    $title = uri_unescape($title);
    $title =~ s/%27/'/gi;
    $title =~ tr/_/ /;
    return $title;
}