]> git.sesse.net Git - ccbs/blob - parse/parse-wiki-countrylist.pl
Add scripts and example SQL for parsing the EM2 series from 2002.
[ccbs] / parse / parse-wiki-countrylist.pl
1 #! /usr/bin/perl
2 use strict;
3 use warnings;
4
5 # Parses country list from
6 # http://en.wikipedia.org/w/index.php?title=List_of_IOC_country_codes&action=edit
7
# Reads MediaWiki source of the IOC country code list from the files named
# on the command line (or from stdin when none are given) and prints, on
# stdout, one SQL transaction that inserts a row into "countries" for every
# country bullet found.  Lines that are not country bullets are ignored.
#
# The guard lets the file be loaded with require/do (e.g. from a test)
# without immediately consuming input; running it as a script is unchanged.
main() unless caller;

# Entry point: emits "begin;", one INSERT per recognised line, then "commit;".
sub main {
    print "begin;\n";

    while (my $line = <>) {
        my ($countrycode, $countryname) = _parse_country_line($line)
            or next;

        # The country code needs no quoting: the pattern only admits [A-Z]{3}.
        printf "INSERT INTO countries (countryname,countrycode) VALUES ('%s','%s');\n",
            _sql_quote($countryname), $countrycode;
    }

    print "commit;\n";
    return;
}

# Extracts (country code, country name) from one wikitext list line, e.g.
#   * AFG - [[Afghanistan]]
#   * CGO - [[Republic of the Congo|Congo]]
# For a piped link the text after the "|" is used as the name.
# Returns an empty list when the line is not a country bullet.
sub _parse_country_line {
    my ($line) = @_;

    # Notes on the pattern:
    #  - "(?:" has to be written without inner whitespace; even under the
    #    x modifier Perl (5.22 and later) rejects a space after the "(".
    #  - The article lookup may not contain "|" or "]", so it cannot run
    #    past the end of the first link into a later piped link on the
    #    same line.
    my ($code, $name) = $line =~ m/
          ^ \* \s*
            ( [A-Z]{3} )                # country code
            \s* - \s*
            \[\[
            (?: [^|\]]* \| )?           # optional article lookup
            ( .*? )                     # country name
            \]\]
    /x or return;

    # fix some wikisyntax ickyness :-)
    $name =~ s/\]\]//g;
    $name =~ s/\[\[//g;

    return ($code, $name);
}

# Makes a value safe to place between single quotes in an SQL string
# literal by doubling every embedded single quote, which is the form the
# SQL standard defines.  A backslash escape (\') is deliberately not used:
# databases that follow the standard treat the backslash as an ordinary
# character, so the quote after it would terminate the literal.
sub _sql_quote {
    my ($value) = @_;
    $value =~ s/'/''/g;
    return $value;
}