hadoop, hdfs, mapreduce and pig
DESCRIPTION
Open presentation, training material. Presented at CSIRO Big Data 2.0 workshop in September 2013, North Ryde, Australia. Animated by hands-on examples.
TRANSCRIPT
●
●
●
●
●●●
●●●●●●●
●●●
●
●●●
●
●
●
●
●
●
●
●
●●
●
●
●
●
> hadoop fs
hadoop fs
●
●
●
●
●
●
●
●
●
$ hadoop fs
● ls
$ hadoop fs -help ls
●
$ hadoop fs -ls <path> $ hadoop fs -ls /
●
$ hadoop fs -ls $ hadoop fs -ls /user/cloudera
●
●
●
$ hadoop fs -mkdir data $ hadoop fs -ls
●
$ cd ~/bigdata/Exercises/hadoop/data $ ls -l $ hadoop fs -put mammograms.zip data
●
● http://localhost:50070
● fsck: an HDFS utility $ hadoop fsck /user/cloudera/data/mammograms.zip \
-blocks -locations -files
●
$ head -n 100 ato_centenary.txt \ | hadoop fs -put - data/ato100.txt
●
$ head -n 1000 ato_centenary.txt \ | hadoop fs -put - data/ato100.txt
●
put: 'data/ato100.txt': File exists
$ hadoop fs -rm data/ato100.txt $ head -n 1000 ato_centenary.txt \ | hadoop fs -put - data/ato100.txt
●
$ hadoop fs -cat data/ato100.txt | less
●
$ hadoop fs -get data/ato100.txt ato100.txt
●
-mv, -cp, -rmdir, -stat ...
●
●●●●
●●
●
●
●
●
●
●
○
■
●○
●
○
●○
●○
○○
●
●
●
●
●
●
●
●
●
●
$ javac -classpath `hadoop classpath` *.java
●
$ jar cvf csiro.jar *.class
●
$ hadoop jar csiro.jar Csiro input_dir output_dir
●
○
●●
map(in_key, in_value) -> (inter_key, inter_value) list
●
○
■
■
■
●
●
let map(key, value) =
  emit(key.toUpper(), value.toUpper())
(‘csiro’, ‘cci’) -> (‘CSIRO’, ‘CCI’)(‘csiro’, ‘cesre’) -> (‘CSIRO’, ‘CESRE’)(‘csiro’, ‘cmse’) -> (‘CSIRO’, ‘CMSE’)(‘toyota’, ‘yaris’) -> (‘TOYOTA’, ‘YARIS’)
●
let map(key, value) =
  foreach char c in value:
    emit(key, c)
(‘cci’, ‘csiro’) -> (‘cci’, ‘c’), (‘cci’, ’s’),(‘cci’, ‘i’), (‘cci’, ‘r’),(‘cci’, ‘o’)
(‘open’, ‘nasa’) -> (‘open’, ‘n’), (‘open’, ’a’),(‘open’, ‘s’), (‘open’, ‘a’)
●
let map(key, value) =emit(value.length(), value)
(‘csiro’, ‘cci’) -> (‘3’, ‘cci’)(‘csiro’, ‘cesre’) -> (‘5’, ‘cesre’)(‘csiro’, ‘cmse’) -> (‘4’, ‘cmse’)(‘toyota’, ‘yaris’) -> (‘5’, ‘yaris’)
●
●○
○
○
●○
●
map(String input_key, String input_value):
  foreach word w in input_value:
    emit(w, 1)
reduce(String output_key, Iterator<int> intermediate_values):
  set count = 0
  foreach v in intermediate_values:
    count += v
  emit(output_key, count)
● Wordcount $ cd ~/bigdata/Exercises/hadoop/wordcount; ls
●
$ javac -classpath `hadoop classpath` *.java
●
$ jar cvf wc.jar *.class
WordCount.java WordMapper.java SumReducer.java
●
$ hadoop jar wc.jar WordCount data/ato100.txt ato_wc
●
$ hadoop fs -ls ato_wc $ hadoop fs -cat ato_wc/part-r-00000 | less $ hadoop fs -cat ato_wc/* | grep 'ATO\|CSIRO'
●
$ hadoop fs -rm -r ato_wc
● Average max temperature ●
●
$ cd ~/bigdata/Exercises/hadoop/data $ less nsw_temp.csv $ less bom_data_Note.txt
●
map(String input_key, String input_value):emit(input_value[3], input_value[5])
(‘IDCJAC0010,061087,1965,01,02,32.2,1,Y’)->(‘01’, 32.2)
(‘IDCJAC0010,066062,1890,04,27,20.2,1,Y’)->(‘04’, 20.2)
(‘IDCJAC0010,066062,2012,02,03,21.0,1,Y’)->(‘02’, 21.0)
●
reduce(String month, Iterator<double> values):
  set count = 0
  set sum = 0
  foreach v in values:
    sum += v
    count++
  set mean = sum/count
  emit(month, mean)
● $ cd ../averagetemp $ gedit *.java&
●
$ cd ../wordcount $ gedit *.java&
AverageTemp.java AverageTempMapper.java AverageReducer.java
●●
$ hadoop fs -put ../data/nsw_temp.csv data
$ javac -classpath `hadoop classpath` *.java $ jar cvf avt.jar *.class $ hadoop jar avt.jar AverageTemp data/nsw_temp.csv avt
● $ hadoop fs -cat avt/part-r-00000
~/bigdata/Exercises/hadoop/averagetemp/sample_solution
●○
○
●●●
○
●●●
●
●●●
●
●
●○○
●
●○
●
●
●●
●●
●●●
●○○○
●○
●○○○
●
●
●
○○○○○○
https://github.com/tomaszbednarz/pig-abc-toilets
● We have list of local ABC Radio stations in Australia
● We have list of all Public Toilets across Australia
● We want to find a closest toilet to a Radio Station
Demonstration of:
● Data Schemas● Use of external libraries● Google Maps API
●
●
●
●
●
●
●