002 - mapreduce in hadoop
DESCRIPTION
MapReduce in Hadoop programming
TRANSCRIPT
7/21/2019 002 - MapReduce in Hadoop
http://slidepdf.com/reader/full/002-mapreduce-in-hadoop 1/2
Goutam Tadi – [email protected]
MAPREDUCE
MapReduce is a programming model for data processing. Hadoop can run mapreduce programs
written in various languages like Java, Ruby, Python and C++. MapReduce programs run in parallel fashion to analyze data of any size.
Analyse sample data with Hadoop:
Word Count:
package com.tadi.mapreduce.chapter2;
import java.io.IOException;import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class WordCount {
public static class WordCountMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);private Text word = new Text();
MapReduce Job
Map
Phase
Reduce
Phase
7/21/2019 002 - MapReduce in Hadoop
http://slidepdf.com/reader/full/002-mapreduce-in-hadoop 2/2
Goutam Tadi – [email protected]
public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {
String line = value.toString();StringTokenizer tokenizer = new StringTokenizer(line);while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());context.write(word, one);
}
}
}
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
IntWritable value;
public void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
int sum = 0;
while (values.iterator().hasNext()) {
sum += values.iterator().next().get();}context.write(key, new IntWritable(sum));
}}
public static void main(String args[]) throws IOException, ClassNotFoundException, InterruptedException{
Configuration conf = new Configuration();@SuppressWarnings("deprecation")Job job = new Job(conf);
job.setJarByClass(WordCount.class);
job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class);
job.setMapperClass(WordCountMapper.class); job.setReducerClass(WordCountReducer.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit (job.waitForCompletion(true) ? 0 : 1);}
}
Explanation
baba baba black
sheep….
baba 1
baba 1
black 1..
baba [1,1]
black [1]
sheep [1]…
baba 2
black 1
sheep 1
Input (raw data) → Map phase → Reduce phase → Output