hadoop, hdfs, mapreduce and pig
DESCRIPTION
Open presentation, training material. Presented at CSIRO Big Data 2.0 workshop in September 2013, North Ryde, Australia. Animated by hands-on examples.
TRANSCRIPT
●
●
●
●
●●●
●●●●●●●
●●●
●
●●●
●
●
●
●
●
●
●
●
●●
●
●
●
●
> hadoop fs
hadoop fs
●
●
●
●
●
●
●
●
●
$ hadoop fs
● ls
$ hadoop fs -help ls
●
$ hadoop fs -ls <path> $ hadoop fs -ls /
●
$ hadoop fs -ls $ hadoop fs -ls /user/cloudera
●
●
●
$ hadoop fs -mkdir data $ hadoop fs -ls
●
$ cd ~/bigdata/Exercises/hadoop/data $ ls -l $ hadoop fs -put mammograms.zip data
●
● http://localhost:50070
● fsck: an HDFS utility $ hadoop fsck /user/cloudera/data/mammograms.zip \
-blocks -locations -files
●
$ head -n 100 ato_centenary.txt \ | hadoop fs -put - data/ato100.txt
●
$ head -n 1000 ato_centenary.txt \ | hadoop fs -put - data/ato100.txt
●
put: 'data/ato100.txt': File exists
$ hadoop fs -rm data/ato100.txt $ head -n 1000 ato_centenary.txt \ | hadoop fs -put - data/ato100.txt
●
$ hadoop fs -cat data/ato100.txt | less
●
$ hadoop fs -get data/ato100.txt ato100.txt
●
-mv, -cp, -rmdir, -stat ...
●
●●●●
●●
●
●
●
●
●
●
○
■
●○
●
○
●○
●○
○○
●
●
●
●
●
●
●
●
●
●
$ javac -classpath `hadoop classpath` *.java
●
$ jar cvf csiro.jar *.class
●
$ hadoop jar csiro.jar Csiro input_dir output_dir
●
○
●●
map(in_key, in_value) -> (inter_key, inter_value) list
●
○
■
■
■
●
●
let map(key, value) =
  emit(key.toUpper(), value.toUpper())
(‘csiro’, ‘cci’) -> (‘CSIRO’, ‘CCI’)(‘csiro’, ‘cesre’) -> (‘CSIRO’, ‘CESRE’)(‘csiro’, ‘cmse’) -> (‘CSIRO’, ‘CMSE’)(‘toyota’, ‘yaris’) -> (‘TOYOTA’, ‘YARIS’)
●
let map(key, value) =
  foreach char c in value:
    emit(key, c)
(‘cci’, ‘csiro’) -> (‘cci’, ‘c’), (‘cci’, ’s’),(‘cci’, ‘i’), (‘cci’, ‘r’),(‘cci’, ‘o’)
(‘open’, ‘nasa’) -> (‘open’, ‘n’), (‘open’, ’a’),(‘open’, ‘s’), (‘open’, ‘a’)
●
let map(key, value) =emit(value.length(), value)
(‘csiro’, ‘cci’) -> (‘3’, ‘cci’)(‘csiro’, ‘cesre’) -> (‘5’, ‘cesre’)(‘csiro’, ‘cmse’) -> (‘4’, ‘cmse’)(‘toyota’, ‘yaris’) -> (‘5’, ‘yaris’)
●
●○
○
○
●○
●
map(String input_key, String input_value):
  foreach word w in input_value:
    emit(w, 1)
reduce(String output_key, Iterator<int> intermediate_values):
  set count = 0
  foreach v in intermediate_values:
    count += v
  emit(output_key, count)
● Wordcount $ cd ~/bigdata/Exercises/hadoop/wordcount; ls
●
$ javac -classpath `hadoop classpath` *.java
●
$ jar cvf wc.jar *.class
WordCount.java WordMapper.java SumReducer.java
●
$ hadoop jar wc.jar WordCount data/ato100.txt ato_wc
●
$ hadoop fs -ls ato_wc $ hadoop fs -cat ato_wc/part-r-00000 | less $ hadoop fs -cat ato_wc/* | grep 'ATO\|CSIRO'
●
$ hadoop fs -rm -r ato_wc
● Average max temperature ●
●
$ cd ~/bigdata/Exercises/hadoop/data $ less nsw_temp.csv $ less bom_data_Note.txt
●
map(String input_key, String input_value):emit(input_value[3], input_value[5])
(‘IDCJAC0010,061087,1965,01,02,32.2,1,Y’)->(‘01’, 32.2)
(‘IDCJAC0010,066062,1890,04,27,20.2,1,Y’)->(‘04’, 20.2)
(‘IDCJAC0010,066062,2012,02,03,21.0,1,Y’)->(‘02’, 21.0)
●
reduce(String month, Iterator<double> values):
  set count = 0
  set sum = 0
  foreach v in values:
    sum += v
    count++
  set mean = sum/count
  emit(month, mean)
● $ cd ../averagetemp $ gedit *.java&
●
$ cd ../wordcount $ gedit *.java&
AverageTemp.java AverageTempMapper.java AverageReducer.java
●●
$ hadoop fs -put ../data/nsw_temp.csv data
$ javac -classpath `hadoop classpath` *.java $ jar cvf avt.jar *.class $ hadoop jar avt.jar AverageTemp data/nsw_temp.csv avt
● $ hadoop fs -cat avt/part-r-00000
~/bigdata/Exercises/hadoop/averagetemp/sample_solution
●○
○
●●●
○
●●●
●
●●●
●
●
●○○
●
●○
●
●
●●
●●
●●●
●○○○
●○
●○○○
●
●
●
○○○○○○
https://github.com/tomaszbednarz/pig-abc-toilets
● We have list of local ABC Radio stations in Australia
● We have list of all Public Toilets across Australia
● We want to find a closest toilet to a Radio Station
Demonstration of:
● Data Schemas● Use of external libraries● Google Maps API
●
●
●
●
●
●
●