# Count crawl dates using Hadoop
# Runs the CountArcCrawlDates class from the fat jar against one ARC file;
# results go to the 'tmp' output path.
hadoop jar target/warcbase-0.1.0-SNAPSHOT-fatjar.jar \
  org.warcbase.analysis.CountArcCrawlDates \
  -input /shared/collections/arc/congress_108th/c108th_1/CONGRESS-08-20040823230037-00000-crawling002.archive.org.arc.gz -output tmp

-----

-- Demo dumping raw ARC files
-- Loads every record of the ARC file as (url, date, mime, content) and
-- writes the tuples back out unchanged with PigStorage.
REGISTER 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';

records = LOAD '/shared/collections/arc/congress_108th/c108th_1/CONGRESS-08-20040823230037-00000-crawling002.archive.org.arc.gz'
          USING org.warcbase.pig.ArcLoader()
          AS (url: chararray, date:chararray, mime:chararray, content:chararray);

STORE records INTO 'tmp-pig1' USING PigStorage();

-----

-- Count crawl dates using Pig
-- Same computation as the Hadoop job above, expressed in Pig Latin.
REGISTER 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';

records = LOAD '/shared/collections/arc/congress_108th/c108th_1/CONGRESS-08-20040823230037-00000-crawling002.archive.org.arc.gz'
          USING org.warcbase.pig.ArcLoader()
          AS (url: chararray, date:chararray, mime:chararray, content:chararray);

-- Keep only the first 8 characters of the date field
-- (presumably the YYYYMMDD part of the crawl timestamp; confirm against ArcLoader).
days = FOREACH records GENERATE SUBSTRING(date, 0, 8) AS date;

-- One group per distinct truncated date, then one (date, count) row per group.
by_day = GROUP days BY date;
day_counts = FOREACH by_day GENERATE group, COUNT(days);

STORE day_counts INTO 'tmp-pig2' USING PigStorage();

----

-- Dump raw extraction of links
-- Applies the ExtractLinks UDF to each record's content and stores whatever
-- it returns, one output row per input record (no flattening here).
REGISTER 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';

records = LOAD '/shared/collections/arc/congress_108th/c108th_1/CONGRESS-08-20040823230037-00000-crawling002.archive.org.arc.gz'
          USING org.warcbase.pig.ArcLoader()
          AS (url: chararray, date:chararray, mime:chararray, content:chararray);

links = FOREACH records GENERATE org.warcbase.pig.piggybank.ExtractLinks(content);

STORE links INTO 'tmp-pig3/';

----

-- Anchor text inversion
-- Flattens the extracted links, groups them by their first field, and ranks
-- the groups by how many links fall into each one.
REGISTER 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';

records = LOAD '/shared/collections/arc/congress_108th/c108th_1/CONGRESS-08-20040823230037-00000-crawling002.archive.org.arc.gz'
          USING org.warcbase.pig.ArcLoader()
          AS (url: chararray, date:chararray, mime:chararray, content:chararray);

-- FLATTEN turns the UDF's result into one row per extracted link.
links = FOREACH records GENERATE FLATTEN(org.warcbase.pig.piggybank.ExtractLinks(content));

-- Group on the first field of each link tuple
-- (presumably the link target URL; verify against ExtractLinks' output schema).
by_target = GROUP links BY $0;

-- After GROUP, $0 is the group key and $1 is the bag of grouped link tuples,
-- so 'anchor' here names the whole bag for that key.
counted = FOREACH by_target GENERATE
            $0 AS url,
            COUNT(links) AS cnt,
            $1 AS anchor;

-- Largest groups first.
ranked = ORDER counted BY cnt DESC;

STORE ranked INTO 'tmp-pig4/';