Jan 2012 HUG: RHadoop

RHadoop, Hadoop for R


DESCRIPTION

RHadoop is an open source project aiming to combine two rising stars in the analytics firmament: R and Hadoop. With more than 2M users, R is arguably the dominant language for expressing complex statistical computations. Hadoop needs no introduction at HUG. With RHadoop we are trying to combine the expressiveness of R with the scalability of Hadoop, and to pave the way for the statistical community to tackle big data with tools they are already familiar with. At this time RHadoop is a collection of three packages that interface with HDFS, HBase and MapReduce, respectively. For MapReduce, the package is called rmr, and we have tried to give it a simple, high-level interface that is true to the MapReduce model and integrated with the rest of the language. We will cover the API and provide some examples.

TRANSCRIPT

Page 1: Jan 2012 HUG: RHadoop

RHadoop, Hadoop for R

Page 2: Jan 2012 HUG: RHadoop

r4stats.com

Page 3: Jan 2012 HUG: RHadoop
Page 4: Jan 2012 HUG: RHadoop

rhdfs

rhbase

rmr

Page 5: Jan 2012 HUG: RHadoop

sapply(data, function)

mapreduce(data, function)

#!/usr/bin/Rscript

library(rmr)

mapreduce(…)
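A minimal sketch of what such a script could look like end to end, using the to.dfs/from.dfs helpers shown later in the deck; the data and the squaring step are illustrative only, and exact signatures may differ across rmr versions:

#!/usr/bin/Rscript
library(rmr)

small.ints = to.dfs(1:1000)              # push a small local vector to the DFS
out = mapreduce(
  input = small.ints,
  map = function(k, v) keyval(v, v^2))   # analogous to sapply(1:1000, function(x) x^2)
result = from.dfs(out)                   # pull the key-value pairs back into the R session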

Page 6: Jan 2012 HUG: RHadoop

[Diagram: Hadoop programming tools arranged along an "Expose MR" / "Hide MR" axis: Java, C++; Cascading, Crunch; rmr, Rhipe, Dumbo, Pydoop, Hadoopy; Hive, Pig; Cascalog, Scalding, Scrunch]

Page 7: Jan 2012 HUG: RHadoop

#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroids, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()

Page 8: Jan 2012 HUG: RHadoop

    if results.isSuccessful() == "FAILED":
        raise "Pig job failed"
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids for this iteration, calculate the moving distance from the last iteration
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k-1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1

if not converged:
    print("did not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]\n")

Page 9: Jan 2012 HUG: RHadoop

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class FindCentroid extends EvalFunc<Double> {
    double[] centroids;

    public FindCentroid(String initialCentroid) {
        String[] centroidStrings = initialCentroid.split(":");
        centroids = new double[centroidStrings.length];
        for (int i = 0; i < centroidStrings.length; i++)
            centroids[i] = Double.parseDouble(centroidStrings[i]);
    }

    @Override
    public Double exec(Tuple input) throws IOException {
        double min_distance = Double.MAX_VALUE;
        double closest_centroid = 0;
        for (double centroid : centroids) {
            double distance = Math.abs(centroid - (Double)input.get(0));
            if (distance < min_distance) {
                min_distance = distance;
                closest_centroid = centroid;
            }
        }
        return closest_centroid;
    }
}

Page 10: Jan 2012 HUG: RHadoop

mapreduce(input, output, map, reduce)

input:  one or more HDFS paths, or the output of other mapreduce jobs
output: HDFS path, defaults to a temp location
map:    a function of two args returning a keyval(), defaults to the identity
reduce: a function of two args returning a keyval(), defaults to none
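Since map defaults to the identity, reduce defaults to none and output defaults to a temp location, the smallest possible call needs only an input; a hedged sketch with a made-up path:

# copies the input to a temporary DFS location using all the defaults
tmp.copy = mapreduce(input = "/tmp/lines")   # hypothetical HDFS path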

Page 11: Jan 2012 HUG: RHadoop

map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v)

reduce = function(k, vv) keyval(k, length(vv))
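Wired together, these two fragments keep roughly one key in ten and count the values per surviving key; a hedged sketch, assuming hash() is available to the mappers and with a made-up input path:

out = mapreduce(
  input  = "/tmp/events",    # hypothetical input path
  map    = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v),
  reduce = function(k, vv) keyval(k, length(vv)))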

Page 12: Jan 2012 HUG: RHadoop

condition = function(x) x > 10

out = mapreduce(
  input = input,
  map = function(k, v) if (condition(v)) keyval(k, v))

Page 13: Jan 2012 HUG: RHadoop

x = from.dfs(hdfs.object)

hdfs.object = to.dfs(x)

Page 14: Jan 2012 HUG: RHadoop

INSERT OVERWRITE TABLE pv_gender_sum

SELECT pv_users.gender, count (DISTINCT pv_users.userid)

FROM pv_users 

GROUP BY pv_users.gender;

mapreduce(
  input =
    mapreduce(
      input = "pv_users",
      map = function(k, v) keyval(v['userid'], v['gender']),
      reduce = function(k, vv) keyval(k, vv[[1]])),
  output = "pv_gender_sum",
  map = function(k, v) keyval(v, 1),
  reduce = function(k, vv) keyval(k, sum(unlist(vv))))

Page 15: Jan 2012 HUG: RHadoop

kmeans =
  function(points, ncenters, iterations = 10,
           distfun = function(a, b) norm(as.matrix(a-b), type = 'F')) {
    newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
    for(i in 1:iterations) {
      newCenters = lapply(values(newCenters), unlist)
      newCenters = kmeans.iter(points, distfun, centers = newCenters)}
    newCenters}

kmeans.iter =
  function(points, distfun, ncenters = length(centers), centers = NULL) {
    from.dfs(
      mapreduce(input = points,
        map = if (is.null(centers)) {
                function(k, v) keyval(sample(1:ncenters, 1), v)}
              else {
                function(k, v) {
                  distances = lapply(centers, function(c) distfun(c, v))
                  keyval(centers[[which.min(distances)]], v)}},
        reduce = function(k, vv)
                   keyval(NULL, apply(do.call(rbind, vv), 2, mean))))}
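A possible driver for the two functions above, assuming to.dfs() accepts a list of points with one numeric vector per record; the random data and the choice of three centers are illustrative:

points = to.dfs(lapply(1:100, function(i) rnorm(2)))   # 100 random 2-D points
centers = kmeans(points, ncenters = 3, iterations = 5)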

Page 16: Jan 2012 HUG: RHadoop

input.specs, output.specs
combine
reduce.on.data.frame
tuning.params
verbose

local, hadoop backends
profiling
managed IO
optimize

Page 17: Jan 2012 HUG: RHadoop

mapreduce(mapreduce(…

mapreduce(input = c(input1, input2), …)

equijoin(left.input = input1, right.input = input2, …)

out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

abstract.job = function(input, output, …) {
  …
  result = mapreduce(input = input, output = output)
  …
  result}
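As a concrete instance of this pattern, the filter from page 12 can be packaged as a reusable job; filter.job, predicate and the paths are hypothetical names introduced here:

filter.job = function(input, output, predicate) {
  # parameterized job: keep only the records whose value satisfies predicate
  result = mapreduce(
    input  = input,
    output = output,
    map    = function(k, v) if (predicate(v)) keyval(k, v))
  result}

kept = filter.job("/tmp/in", "/tmp/out", function(x) x > 10)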

Page 18: Jan 2012 HUG: RHadoop

repo: github.com/RevolutionAnalytics/RHadoop/

license: Apache 2.0

documentation: R help, github wiki

Q/A: github issue tracking

[email protected]

project lead: David Champagne