Exploring the Enron Email Dataset with Kiji and Hive

Post on 30-Jun-2015

1.082 Views

Category:

Technology

1 Downloads

Preview:

Click to see full reader

DESCRIPTION

Talk given at the September 2013 SF Hadoop Users Group by Lee Sheng: http://www.meetup.com/hadoopsf/events/136499862/

TRANSCRIPT

●○○○○

●●●

●●●

●●●

-- Hive external table backed by the Kiji table at kiji://.env/enron_email/emails.
-- Every Hive column is a STRUCT wrapping a single Kiji cell: `ts` carries the
-- cell timestamp and `value` carries the cell contents.
-- The entries of 'kiji.columns' line up one-to-one, in order, with the Hive
-- columns declared below (six columns, six entries).
-- NOTE(review): the `[0]` suffix presumably selects the most recent version of
-- each cell -- confirm against the Kiji Hive adapter documentation.
CREATE EXTERNAL TABLE emails (
    mid      STRUCT<ts: TIMESTAMP, value: STRING>,
    dateLong STRUCT<ts: TIMESTAMP, value: BIGINT>,
    fromStr  STRUCT<ts: TIMESTAMP, value: STRING>,
    toStr    STRUCT<ts: TIMESTAMP, value: STRING>,
    subject  STRUCT<ts: TIMESTAMP, value: STRING>,
    body     STRUCT<ts: TIMESTAMP, value: STRING>
)
STORED BY 'org.kiji.hive.KijiTableStorageHandler'
WITH SERDEPROPERTIES (
    -- Must be a single string literal: SERDEPROPERTIES values cannot be
    -- built with `+` concatenation.
    'kiji.columns' = 'info:mid[0],info:date[0],info:from[0],info:to[0],info:subject[0],info:body[0]'
)
TBLPROPERTIES (
    'kiji.table.uri' = 'kiji://.env/enron_email/emails'
);

-- Ten most prolific senders: one output row per distinct sender address,
-- ranked by the number of emails sent, highest first.
SELECT
    fromStr.value AS fromStr,
    count(1)      AS count
FROM emails
GROUP BY
    fromStr.value
ORDER BY
    count DESC
LIMIT 10;

-- Ten most frequent (sender, recipient) pairs.
-- `toStr.value` holds a comma-separated recipient list, so it is split on ','
-- and exploded into one row per recipient; trim() strips the whitespace left
-- around each address by the split.
SELECT
    fromStr.value    AS fromStr,
    trim(splitToStr) AS toStr,
    count(1)         AS count
FROM emails
LATERAL VIEW explode(split(toStr.value, ',')) tos AS splitToStr
GROUP BY
    fromStr.value,
    trim(splitToStr)
ORDER BY count DESC
LIMIT 10;

User Emails

Emails Table Sentiment

Producer

-- Weekly sentiment statistics for emails sent from enron.com addresses:
-- average, standard deviation and message count per week bucket.
-- The sender's domain is obtained by stripping everything up to and including
-- the last '@' (greedy `.*@`) from the from-address.
-- weeknum = 52 * (years since 1999) + week of year, computed from the cell
-- timestamp of the `datelong` column (datelong.ts), not from datelong.value.
-- NOTE(review): weekofyear() can report week 52/53 for dates in early January
-- (and week 1 for late December), so buckets near year boundaries may land in
-- the wrong year -- confirm whether this matters for the analysis.
-- NOTE(review): `sentiment` is not among the columns declared by the
-- CREATE EXTERNAL TABLE statement in this transcript; presumably the table
-- definition is extended once the sentiment producer has run -- verify.
SELECT
    ((year(datelong.ts) - 1999) * 52 + weekofyear(datelong.ts)) AS weeknum,
    avg(sentiment.value)    AS avgsentiment,
    stddev(sentiment.value) AS stddevsentiment,
    count(1)                AS nummessages
FROM emails
WHERE regexp_replace(fromStr.value, ".*@", "") == "enron.com"
GROUP BY
    ((year(datelong.ts) - 1999) * 52 + weekofyear(datelong.ts));

-- Total sentiment per lower-cased word, most negative words first.
-- The inner query emits one row per word of each email sent from an
-- enron.com address, carrying that email's sentiment score; the outer query
-- sums those scores per word.
-- NOTE(review): sentences(body.value)[0] takes only the first sentence of
-- each body, so words from later sentences are not counted -- confirm this
-- is intended.
-- NOTE(review): `sentiment` is not among the columns declared by the
-- CREATE EXTERNAL TABLE statement in this transcript; presumably the table
-- definition is extended once the sentiment producer has run -- verify.
SELECT
    lword          AS word,
    sum(sentiment) AS totalsentiment
FROM (
    SELECT
        mid.value       AS mid,
        lower(word)     AS lword,
        sentiment.value AS sentiment
    FROM emails
    LATERAL VIEW explode(sentences(body.value)[0]) wds AS word
    WHERE regexp_replace(fromStr.value, ".*@", "") == "enron.com"
) subquery
GROUP BY lword
ORDER BY totalsentiment ASC;

top related