h1. Working plan
h2. Language detection
Pig example
{code}
-- Simple language detection example:
-- counts how many records of an ARC file are detected as each language.
register 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';
-- Short aliases for the loader and the UDFs used below
DEFINE ArcLoader org.warcbase.pig.ArcLoader();
DEFINE ExtractRawText org.warcbase.pig.piggybank.ExtractRawText();
DEFINE DetectLanguage org.warcbase.pig.piggybank.DetectLanguage();
-- Load the ARC records: url, date, mime, content
raw = load 'arcfile.arc' using ArcLoader() as (url: chararray, date:chararray, mime:chararray, content:chararray);
-- Replace the content of each record with its extracted raw text
b = foreach raw generate url, mime, ExtractRawText(content) as content;
-- Detect the language of the extracted text
c1 = foreach b generate DetectLanguage(content) as lang;
-- Group by language and count the records in each group
d = group c1 by lang;
-- The result is named 'e' because the store/dump statements below refer to 'e'
-- (it was previously assigned to 'g', leaving 'e' undefined).
e = foreach d generate group, COUNT(c1);
store e into 'e';
-- dump e;
{code}
For now, clone [https://github.com/cneud/warcbase] and check out the pig-integration branch. Running the unit tests executes the above Pig Latin script against the provided gzipped test ARC file. The language distribution reported by Tika is:
{code}
[ca, 1]
[en, 68]
[et, 8]
[hu, 34]
[it, 3]
[lt, 143]
[no, 35]
[pt, 2]
[ro, 6]
{code}
The UDF is added to the piggy bank of the warcbase project by adding the following class, which leverages Tika:
{code}
package org.warcbase.pig.piggybank;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.tika.language.LanguageIdentifier;
import java.io.IOException;
/**
 * Pig UDF that returns the language Tika's {@link LanguageIdentifier}
 * reports for the text passed as the first field of the input tuple.
 */
public class DetectLanguage extends EvalFunc<String> {

  /**
   * @param input tuple whose first field is the text to analyse
   * @return the language reported by Tika, or {@code null} when the tuple is
   *         null, empty, or its first field is null
   * @throws IOException if the first field cannot be read from the tuple
   */
  @Override
  public String exec(Tuple input) throws IOException {
    final boolean hasText = input != null && input.size() != 0 && input.get(0) != null;
    if (!hasText) {
      return null;
    }

    final LanguageIdentifier identifier = new LanguageIdentifier((String) input.get(0));
    return identifier.getLanguage();
  }
}
{code}
h2. MIME type detection
{code}
package org.warcbase.pig.piggybank;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import java.io.IOException;
import java.io.InputStream;
import java.io.ByteArrayInputStream;
import org.apache.tika.parser.AutoDetectParser;
/**
 * Pig UDF that uses Apache Tika to detect the MIME type of the content
 * passed as the first field of the input tuple.
 */
public class DetectMimeType extends EvalFunc<String> {

  // Created on first use and then reused, so the detector, parser and facade
  // are not rebuilt for every record processed by this UDF instance.
  private Tika tika;

  /**
   * @param input tuple whose first field is the content to inspect
   * @return the MIME type reported by Tika, or {@code null} when the tuple is
   *         null, empty, or its first field is null
   * @throws IOException if the field cannot be read or Tika fails to read the content
   */
  @Override
  public String exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0 || input.get(0) == null) {
      return null;
    }

    String content = (String) input.get(0);
    // Encode explicitly as UTF-8: it can represent every Java String, so the
    // bytes handed to Tika do not depend on the platform default charset.
    InputStream is = new ByteArrayInputStream(content.getBytes("UTF-8"));
    return getTika().detect(is);
  }

  /** Returns the shared Tika facade, building it on the first call. */
  private Tika getTika() {
    if (tika == null) {
      DefaultDetector detector = new DefaultDetector();
      AutoDetectParser parser = new AutoDetectParser(detector);
      tika = new Tika(detector, parser);
    }
    return tika;
  }
}
{code}
h1. Goals
Create UDFs for
* language detection using Tika
* MIME type identification using Tika
h1. Pig scripts
Combined script for MIME type and language detection:
{code}
-- MIME type check followed by language detection on a single ARC file
REGISTER 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';

-- Short names for the piggybank UDFs used in this script
DEFINE DetectMimeType org.warcbase.pig.piggybank.DetectMimeType();
DEFINE ExtractRawText org.warcbase.pig.piggybank.ExtractRawText();
DEFINE DetectLanguage org.warcbase.pig.piggybank.DetectLanguage();

-- Read the ARC records as (url, date, mime, content)
records = LOAD '/tmp/IAH-20080430204825-00000-blackbook.arc.gz'
    USING org.warcbase.pig.ArcLoader()
    AS (url: chararray, date: chararray, mime: chararray, content: chararray);

-- Let Tika determine the MIME type of each record's content
typed = FOREACH records GENERATE url, mime, content, DetectMimeType(content) AS tikaMime;

-- Keep only the records Tika classifies as text
textual = FILTER typed BY (tikaMime MATCHES 'text.*');

-- Remove the markup, leaving the raw text
stripped = FOREACH textual GENERATE url, mime, tikaMime, ExtractRawText(content) AS txt;

-- Identify the language of the raw text with Tika
detected = FOREACH stripped GENERATE url, mime, tikaMime, DetectLanguage(txt) AS lang, txt;

STORE detected INTO 'tmp' USING PigStorage();
{code}
Example output from above script: [^output.txt]
h2. Language detection
Pig example
{code}
-- Simple language detection example:
-- counts how many records of an ARC file are detected as each language.
register 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';
-- Short aliases for the loader and the UDFs used below
DEFINE ArcLoader org.warcbase.pig.ArcLoader();
DEFINE ExtractRawText org.warcbase.pig.piggybank.ExtractRawText();
DEFINE DetectLanguage org.warcbase.pig.piggybank.DetectLanguage();
-- Load the ARC records: url, date, mime, content
raw = load 'arcfile.arc' using ArcLoader() as (url: chararray, date:chararray, mime:chararray, content:chararray);
-- Replace the content of each record with its extracted raw text
b = foreach raw generate url, mime, ExtractRawText(content) as content;
-- Detect the language of the extracted text
c1 = foreach b generate DetectLanguage(content) as lang;
-- Group by language and count the records in each group
d = group c1 by lang;
-- The result is named 'e' because the store/dump statements below refer to 'e'
-- (it was previously assigned to 'g', leaving 'e' undefined).
e = foreach d generate group, COUNT(c1);
store e into 'e';
-- dump e;
{code}
For now, clone [https://github.com/cneud/warcbase] and check out the pig-integration branch. Running the unit tests executes the above Pig Latin script against the provided gzipped test ARC file. The language distribution reported by Tika is:
{code}
[ca, 1]
[en, 68]
[et, 8]
[hu, 34]
[it, 3]
[lt, 143]
[no, 35]
[pt, 2]
[ro, 6]
{code}
The UDF is added to the piggy bank of the warcbase project by adding the following class, which leverages Tika:
{code}
package org.warcbase.pig.piggybank;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.tika.language.LanguageIdentifier;
import java.io.IOException;
/**
 * Pig UDF that returns the language Tika's {@link LanguageIdentifier}
 * reports for the text passed as the first field of the input tuple.
 */
public class DetectLanguage extends EvalFunc<String> {

  /**
   * @param input tuple whose first field is the text to analyse
   * @return the language reported by Tika, or {@code null} when the tuple is
   *         null, empty, or its first field is null
   * @throws IOException if the first field cannot be read from the tuple
   */
  @Override
  public String exec(Tuple input) throws IOException {
    final boolean hasText = input != null && input.size() != 0 && input.get(0) != null;
    if (!hasText) {
      return null;
    }

    final LanguageIdentifier identifier = new LanguageIdentifier((String) input.get(0));
    return identifier.getLanguage();
  }
}
{code}
h2. MIME type detection
{code}
package org.warcbase.pig.piggybank;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import java.io.IOException;
import java.io.InputStream;
import java.io.ByteArrayInputStream;
import org.apache.tika.parser.AutoDetectParser;
/**
 * Pig UDF that uses Apache Tika to detect the MIME type of the content
 * passed as the first field of the input tuple.
 */
public class DetectMimeType extends EvalFunc<String> {

  // Created on first use and then reused, so the detector, parser and facade
  // are not rebuilt for every record processed by this UDF instance.
  private Tika tika;

  /**
   * @param input tuple whose first field is the content to inspect
   * @return the MIME type reported by Tika, or {@code null} when the tuple is
   *         null, empty, or its first field is null
   * @throws IOException if the field cannot be read or Tika fails to read the content
   */
  @Override
  public String exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0 || input.get(0) == null) {
      return null;
    }

    String content = (String) input.get(0);
    // Encode explicitly as UTF-8: it can represent every Java String, so the
    // bytes handed to Tika do not depend on the platform default charset.
    InputStream is = new ByteArrayInputStream(content.getBytes("UTF-8"));
    return getTika().detect(is);
  }

  /** Returns the shared Tika facade, building it on the first call. */
  private Tika getTika() {
    if (tika == null) {
      DefaultDetector detector = new DefaultDetector();
      AutoDetectParser parser = new AutoDetectParser(detector);
      tika = new Tika(detector, parser);
    }
    return tika;
  }
}
{code}
h1. Goals
Create UDFs for
* language detection using Tika
* MIME type identification using Tika
h1. Pig scripts
Combined script for MIME type and language detection:
{code}
-- MIME type check followed by language detection on a single ARC file
REGISTER 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';

-- Short names for the piggybank UDFs used in this script
DEFINE DetectMimeType org.warcbase.pig.piggybank.DetectMimeType();
DEFINE ExtractRawText org.warcbase.pig.piggybank.ExtractRawText();
DEFINE DetectLanguage org.warcbase.pig.piggybank.DetectLanguage();

-- Read the ARC records as (url, date, mime, content)
records = LOAD '/tmp/IAH-20080430204825-00000-blackbook.arc.gz'
    USING org.warcbase.pig.ArcLoader()
    AS (url: chararray, date: chararray, mime: chararray, content: chararray);

-- Let Tika determine the MIME type of each record's content
typed = FOREACH records GENERATE url, mime, content, DetectMimeType(content) AS tikaMime;

-- Keep only the records Tika classifies as text
textual = FILTER typed BY (tikaMime MATCHES 'text.*');

-- Remove the markup, leaving the raw text
stripped = FOREACH textual GENERATE url, mime, tikaMime, ExtractRawText(content) AS txt;

-- Identify the language of the raw text with Tika
detected = FOREACH stripped GENERATE url, mime, tikaMime, DetectLanguage(txt) AS lang, txt;

STORE detected INTO 'tmp' USING PigStorage();
{code}
Example output from above script: [^output.txt]