compared with
This line was removed.
This word was removed. This word was added.
This line was added.

Changes (6)

View Page History

-- Pipeline over a single ARC file: re-detect each record's MIME type with
-- Tika, keep only the textual records, and identify their language.
REGISTER 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';

-- Short names for the warcbase UDFs used below.
DEFINE DetectMimeType org.warcbase.pig.piggybank.DetectMimeType();
DEFINE ExtractRawText org.warcbase.pig.piggybank.ExtractRawText();
DEFINE DetectLanguage org.warcbase.pig.piggybank.DetectLanguage();

-- One tuple per ARC record: (url, date, mime, content).
raw = LOAD '/tmp/IAH-20080430204825-00000-blackbook.arc.gz'
      USING org.warcbase.pig.ArcLoader()
      AS (url: chararray, date: chararray, mime: chararray, content: chararray);

-- Append tikaMime: the MIME type DetectMimeType derives from the content,
-- kept alongside the mime value reported by the loader.
a = FOREACH raw GENERATE
        url,
        mime,
        content,
        DetectMimeType(content) AS tikaMime;

-- Keep only records whose detected type begins with "text"
-- (MATCHES must match the whole string, hence the trailing .*).
b = FILTER a BY (tikaMime MATCHES 'text.*');

-- Swap the raw content for the tag-stripped text produced by ExtractRawText.
c = FOREACH b GENERATE
        url,
        mime,
        tikaMime,
        ExtractRawText(content) AS txt;

-- Tag every record with the language DetectLanguage reports for its text.
d = FOREACH c GENERATE
        url,
        mime,
        tikaMime,
        DetectLanguage(txt) AS lang,
        txt;