User:IMSoP/parse t t.pl
|
This is a perl script I made to eliminate formatting inconsistencies, and fill in missing information, in Twin towns. Specifically, it parses every listing to make sure that there are entries for both ends of the twinning (until I made it, more than 50% of the entries were under UK, which is of course impossible; they are all twinned with somewhere, which is therefore twinned back with them).
#!/usr/bin/perl -w
use strict;
use Data::Dumper;
my (%countries, %uk_regions);
my ($head_country, $head_region, $head_town, $list); #where region == "county", but that would just look like "country"...
my ($found_town, $found_country, $extra_info);
my ($this_town_hash);
#partial regexes:
my $placeX = qr/(?<!\()\[\[[^\]]+\]\]/; #assume placename is all one link: [[+(many not-]s)+]]
my $countryX = qr/(?:\[\[)?([\w\s]+)(?:\]\])?\s?/; #can't assume that here though, and we'd rather lose the brackets
#clusters an instance of (a place, a country, and everything_else)
my $place_list = qr/($placeX)(?:, in |, | in )$countryX(.*?)(?:- |; |$)/;
#note that $countryX has capturing built-in, whereas $placeX doesn't need to
#step 1: build a nested structure of all towns listed
while(<>) {
if (/^==\[\[(.*?)\]\]==/) {
$head_country = $1;
&trim_ws($head_country);
unless ($countries{$head_country}) { #if we haven't dealt with this country before
$countries{$head_country}={}; #create an anonymous hash to contain its details
};
next
};
if (/^===(\[\[.*?\]\])===/) {
$head_region = $1;
&trim_ws($head_region);
$uk_regions{$head_region} = ();
next
}
#first, split into main town and twins
if (/^\*(\[\[.*?\]\])(?:: | - )(.*?)$/) {
$head_town = $1;
$this_town_hash = ( \%{$countries{$head_country}->{$head_town}} or \%{()} ); #anonymous hash to represent the town's twins
if ($head_country eq "UK") {push @{$uk_regions{$head_region}}, $head_town};
#now find the individual parts of the list
$list = $2;
while($list =~ /$place_list/g) {
($found_town, $found_country, $extra_info) = ($1, $2, $3);
&trim_ws($found_town, $found_country, $extra_info);
#add the new-found twin to the hash for the current head_town
$this_town_hash->{$found_town} = [$found_country, $extra_info];
#now find the "missing matches"...
unless ($countries{$found_country}->{$found_town}->{$head_town}) {
$countries{$found_country}->{$found_town}->{$head_town} = [$head_country, ''];
}
}
}
}
#$Data::Dumper::Terse = 1;
#print Data::Dumper->Dump([%countries, %uk_regions], []);
foreach $head_country (sort keys %countries) {
print "==[[$head_country]]==\n";
if($head_country eq "UK") {
#special case code to split UK by region
foreach $head_region (sort keys %uk_regions) {
print "===$head_region===\n";
foreach $head_town (sort @{$uk_regions{$head_region}}) {
&print_town_entry($head_town, $head_country);
delete $countries{$head_country}->{$head_town};
}
print "\n"; #at end of region
}
} else {
#general code for everywhere else (un-regioned)
foreach $head_town (sort keys %{$countries{$head_country}}) {
&print_town_entry($head_town, $head_country);
}
}
print "\n\n"; #at end of country
}
#dump any places in the UK that were never found a home for in the region hierarchy
$Data::Dumper::Terse = 1;
print Data::Dumper->Dump([%{$countries{"UK"}}], []);
exit;
sub print_town_entry {
($head_town, $head_country) = @_;
$list = "*$head_town - ";
foreach $found_town (sort keys %{$countries{$head_country}->{$head_town}}) {
$list .= "$found_town, $countries{$head_country}->{$head_town}->{$found_town}->[0]";
$list .= " ($countries{$head_country}->{$head_town}->{$found_town}->[1])"
if $countries{$head_country}->{$head_town}->{$found_town}->[1];
$list .= "; ";
}
$list =~ s/; $/\n/; #trim off the last (trailing) "; ", and put a newline instead
print $list;
}
sub trim_ws {
foreach(@_) {s/^\s*(.*?)\s*$/$1/;}
}