View Source

Pig example to "parse" METS on the Hadoop Hackathon virtual box instance

{code}
-- Extracts the page numbers listed in the <gbs:problem_pages> section of a
-- METS file and writes them to an output directory.
--
-- Expected input shape (one or more <gbs:page> children per section):
--   <gbs:problem_pages>
--     <gbs:page>12</gbs:page>
--     <gbs:page>57</gbs:page>
--   </gbs:problem_pages>
register /home/scape/piggybank-0.12.0.jar;

-- XMLLoader yields one record per <gbs:problem_pages> element; the record is
-- the element's raw text, enclosing tags included.
xml_file = LOAD '/home/scape/ONB_+Z119574606.xml' using org.apache.pig.piggybank.storage.XMLLoader('gbs:problem_pages') as (mypages);

-- Turn the boundary between two adjacent page entries into a '-' separator.
-- Any run of whitespace (including none, or CRLF line endings) is accepted
-- between the closing and the opening tag, so the result does not depend on
-- how the XML happens to be pretty-printed.
page_numbers1 = foreach xml_file generate REPLACE(mypages, '</gbs:page>[ \t\r\n]*<gbs:page>', '-') as mypages;

-- Drop every remaining opening/closing <gbs:problem_pages> and <gbs:page> tag,
-- leaving only the '-'-separated page numbers plus surrounding whitespace.
page_numbers2 = foreach page_numbers1 generate REPLACE(mypages, '</?gbs:(problem_pages|page)>', '') as mypages;

-- Remove the leading/trailing whitespace left over from the XML indentation.
page_numbers3 = foreach page_numbers2 generate TRIM(mypages) as mypages;

-- STRSPLIT returns a tuple, so FLATTEN spreads the page numbers across the
-- fields of a single output record per <gbs:problem_pages> element.
page_numbers4 = foreach page_numbers3 generate FLATTEN(STRSPLIT(mypages, '-', 0));

-- Store the page numbers of the problem pages in an output directory.
store page_numbers4 into '/home/scape/out';
{code}