Transcript
Page 1: Teaching Your Machine To Find Fraudsters

TEACHING YOUR MACHINE TO FIND FRAUDSTERS

Ian [email protected] twitter.com/ianbarber

Page 3: Teaching Your Machine To Find Fraudsters

5%3%

.1%8%

SOME SMALL NUMBERS

Page 4: Teaching Your Machine To Find Fraudsters

99% ACCURACY

Page 5: Teaching Your Machine To Find Fraudsters

REALLY LEGITIMATE

REALLY FRAUD

EVALUATED LEGITIMATE

989 0

EVALUATED FRAUD

10 1

Page 6: Teaching Your Machine To Find Fraudsters

REALLY LEGITIMATE

REALLY FRAUD

EVALUATED LEGITIMATE

989 0

EVALUATED FRAUD

10 1

90% WRONG

Page 7: Teaching Your Machine To Find Fraudsters

ANOMALY DETECTION

Page 8: Teaching Your Machine To Find Fraudsters

0

7.5

15

22.5

30

Clicks

Date

Page 9: Teaching Your Machine To Find Fraudsters

Detector

User Clicks Ad

Alarm

No Alarm

Landing Page

SOFTWARE ARCHITECTURE

Buffer

Page 10: Teaching Your Machine To Find Fraudsters

Threshold

Expected Clicks

Alarm

Sensitivity

Data Buffer

statistics

DETECTOR

Page 11: Teaching Your Machine To Find Fraudsters

/**
 * Moving-average click detector (average.php). This slide holds the first
 * half of the function; the alarm test and the return statement follow on a
 * later slide.
 *
 * Rows are read from fraudclicks.csv. Once more than seven values have been
 * buffered, the oldest is dropped and the mean and standard deviation of the
 * remaining seven are computed for comparison against the current row.
 *
 * @param float $sen Sensitivity: how many standard deviations above the
 *                   window mean a value must be before an alarm is raised.
 * @return array Two elements: the alarm count minus one, and the $dtd counter
 *               (labelled "Days To Detect" on the result slides).
 */
function detect($sen) {
    $window = array();
    $i = 0;
    $alarmCount = 0;
    $dtd = 0;
    $avg = $stddev = 0;
    $fraud = fopen("fraudclicks.csv", 'r');
    while($d = fgetcsv($fraud)) {
        $i++;
        if(count($window) > 7) {
            // Keep exactly the seven most recent previous values.
            array_shift($window);
            $avg = array_sum($window) / 7;
            // Start the sum of squares from zero for every row; otherwise the
            // previous row's standard deviation is added into this one.
            $stddev = 0;
            foreach($window as $val) {
                // Deviation is measured from the window mean ($avg).
                $stddev += pow($val - $avg, 2);
            }
            $stddev = sqrt($stddev/7);

average.php

Page 12: Teaching Your Machine To Find Fraudsters

0

0.05

0.1

0.15

0.2

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20

Page 13: Teaching Your Machine To Find Fraudsters

// Second half of detect() from average.php: alarm test, window update, return.
            // Alarm when the current value exceeds mean + $sen * stddev.
            if($d[1] > ($avg + ($sen * $stddev))){
                $alarmCount++;
                // NOTE(review): row 201 looks like the point where the known
                // fraud begins, so the first alarm after it ends the run - confirm.
                if($i > 201) {
                    break;
                }
            } else {
                // Rows past 201 with no alarm count towards "days to detect".
                if($i > 201) {
                    $dtd++;
                }
            }
        }
        array_push($window, $d[1]);
    }
    // Release the CSV handle before returning.
    fclose($fraud);
    // NOTE(review): the -1 presumably excludes the final alarm that stopped
    // the loop from the false-alarm count - confirm.
    return array($alarmCount-1, $dtd);
}

Page 14: Teaching Your Machine To Find Fraudsters

0

7.5

15

22.5

30

Clicks

Date

18 False Alarms 1 Day To Detect

1.6 SENSITIVITY

Page 15: Teaching Your Machine To Find Fraudsters

0

7.5

15

22.5

30

Clicks

Date

1 False Alarm 18 Days To Detect

2.7 SENSITIVITY

Page 16: Teaching Your Machine To Find Fraudsters

SICKNESS AVAILABILITY

Page 17: Teaching Your Machine To Find Fraudsters

// detect($sens) from sickavail.php, first part. The function is not shown in
// full in this transcript: the code that fills $window and $avail, increments
// $i and returns a result does not appear on any slide.
//
// For each CSV row, $dow is the numeric day of week (date("w"): 0 = Sunday)
// of the date in column 0. The estimate is only computed once $window holds
// at least seven entries and $avail has an entry for this weekday.
function detect($sens) { $i = 0; $alarms = 0; $dtd = 0; $window = array(); $avail = array(); $fraud = fopen("fraudclicks.csv", 'r'); while($dat = fgetcsv($fraud)) { $dow = date("w", strtotime($dat[0])); if( count($window) >= 7 && isset($avail[$dow]) ) {

// $sick is the mean, over the window, of each value divided by the average of
// $avail[<that entry's key>].
// NOTE(review): this presumes $window is keyed by weekday and that $avail has
// a non-empty list for every such key - confirm against the full source.
$sick = 0; foreach($window as $day => $value) { $dowavg = array_sum($avail[$day]) / count($avail[$day]); $sick += $value / $dowavg; } $sick /= count($window);

sickavail.php

Page 18: Teaching Your Machine To Find Fraudsters

// Continuation of detect() from sickavail.php.
// $avlblty is the mean of $avail[$dow]; multiplying it by $sick gives $est,
// the expected value for the current row. $p is then the Poisson probability
// e^(-est) * est^k / k! of observing exactly k = $dat[1].
// NOTE(review): fac() is not defined anywhere in this transcript - presumably
// a factorial helper; confirm.
$avlblty = array_sum($avail[$dow]) / count($avail[$dow]); $est = $sick * $avlblty; $fac = fac($dat[1]); $p = exp(-$est) * pow($est,$dat[1]) / $fac; // poisson calc

// Alarm only when the observation is improbable ($p below $sens) AND above the
// estimate, so unusually low values never raise an alarm.
// NOTE(review): $i is compared with 201 here but no slide shows it being
// incremented - confirm against the full source.
if($p < $sens && $dat[1] > $est) { $alarms++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } }

} // end if

Page 19: Teaching Your Machine To Find Fraudsters

0

0.05

0.1

0.15

0.2

1 2 3 4 5 6 7 8 9 10

Page 20: Teaching Your Machine To Find Fraudsters

0

7.5

15

22.5

30

Clicks

Date

1 False Alarm 1 Day To Detect

0.011 SENSITIVITY

Page 21: Teaching Your Machine To Find Fraudsters

SUPERVISED CLASSIFIERS

Page 22: Teaching Your Machine To Find Fraudsters

Classifier

User Purchase

Fraud

Not Fraud

Transaction Processor

Transaction Database Learner

classification model

SOFTWARE ARCHITECTURE

Page 23: Teaching Your Machine To Find Fraudsters

Learner Training Data Model

Model

Test Data

Classifier Prediction Accuracy

EVALUATING THE CLASSIFIER

Page 24: Teaching Your Machine To Find Fraudsters

0

5

10

15

20

0 5 10 15 20

Page 25: Teaching Your Machine To Find Fraudsters

?

0

5

10

15

20

0 5 10 15 20

Page 26: Teaching Your Machine To Find Fraudsters

?

0

5

10

15

20

0 5 10 15 20

Page 27: Teaching Your Machine To Find Fraudsters

// Labelled sample transactions: three legitimate orders followed by three
// fraudulent ones. Each row carries the fraud flag, a price, a short
// description and a shipping country code.
$docs = [
    ['fraud' => false, 'price' => 1699,  'desc' => 'toy ninja', 'ship' => 'US'],
    ['fraud' => false, 'price' => 20000, 'desc' => 'TV',        'ship' => 'US'],
    ['fraud' => false, 'price' => 2500,  'desc' => 'cds',       'ship' => 'US'],
    ['fraud' => true,  'price' => 20000, 'desc' => 'console',   'ship' => 'CN'],
    ['fraud' => true,  'price' => 5000,  'desc' => 'books',     'ship' => 'US'],
    ['fraud' => true,  'price' => 15000, 'desc' => 'ipod',      'ship' => 'CN'],
];

Page 28: Teaching Your Machine To Find Fraudsters

// fraudknn.php: build the Xapian index used as the nearest-neighbour store.
// Open (or create) the "index" database and prepare an English-stemming
// term generator.
$db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);
$stem = new XapianStem("english");
$idx = new XapianTermGenerator();
$idx->set_stemmer($stem);

// One Xapian document per sample: the class label ("fraud" / "clean") is
// stored as the document data, while price, description and shipping
// country are indexed together as free text.
foreach ($docs as $key => $doc) {
    $label = $doc['fraud'] ? "fraud" : "clean";
    $text = $doc['price'] . ' ' . $doc['desc'] . ' ' . $doc['ship'];

    $xdoc = new XapianDocument();
    $xdoc->set_data($label);
    $idx->set_document($xdoc);
    $idx->index_text($text);
    $db->add_document($xdoc, $key);
}

// Drop the handle. NOTE(review): presumably this is what flushes and closes
// the database - confirm against the Xapian bindings.
$db = null;

fraudknn.php

Page 29: Teaching Your Machine To Find Fraudsters

// testknn.php: the unlabelled transaction to classify.
$test = array(
    'price' => 10000,
    'desc' => 'TV',
    'ship' => 'CN',
);

// Re-open the index with an English-stemming term generator.
$db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);
$stem = new XapianStem("english");
$idx = new XapianTermGenerator();
$idx->set_stemmer($stem);

// Add the candidate to the index (price, description and shipping country
// as one text string) and keep the id returned by add_document() for later.
$xdoc = new XapianDocument();
$idx->set_document($xdoc);
$idx->index_text(implode(' ', array($test['price'], $test['desc'], $test['ship'])));
$id = $db->add_document($xdoc);

testknn.php

Page 30: Teaching Your Machine To Find Fraudsters

// Mark the candidate document as the only "relevant" document and ask
// Xapian for up to 10 expansion terms derived from it.
$enq = new XapianEnquire($db);
$rset = new XapianRSet();
$rset->add_document($id);
$eset = $enq->get_eset(10, $rset);

// Collect the expansion terms.
$terms = array();
for ($i = $eset->begin(); !$i->equals($eset->end()); $i->next()) {
    $terms[] = $i->get_term();
}

// OR the terms into one query and fetch the first four matches.
$q = new XapianQuery(XapianQuery::OP_OR, $terms);
$enq->set_query($q);
$matches = $enq->get_mset(0, 4, $rset);

Page 31: Teaching Your Machine To Find Fraudsters

// Dump the stored class label of every match except the candidate itself,
// then remove the candidate from the index again.
for ($i = $matches->begin(); !$i->equals($matches->end()); $i->next()) {
    $hit = $i->get_document();
    if ($hit->get_docid() == $id) {
        continue;
    }
    $class = $hit->get_data();
    var_dump($class);
}
$db->delete_document($id);

$ php testknn.php
string(5) "clean"
string(5) "fraud"
string(5) "fraud"

Page 32: Teaching Your Machine To Find Fraudsters

TRANSACTION PARAMETERS

Page 33: Teaching Your Machine To Find Fraudsters
Page 34: Teaching Your Machine To Find Fraudsters
Page 35: Teaching Your Machine To Find Fraudsters

/**
 * Score how closely an e-mail address resembles a customer's name
 * (email.php). This slide holds the first half of the function; the
 * per-word comparison and the fallback return follow on the next slide.
 *
 * Both inputs are lower-cased. "." and "+" in the local part are replaced by
 * spaces, and the domain is cut at its first dot.
 *
 * @param string $name  Customer name; words are separated by single spaces.
 * @param string $email E-mail address to compare against the name.
 * @return float 1.0 when the whole name is more than 80% similar to the
 *               local part, 0.8 when it is more than 80% similar to the
 *               domain, otherwise the score produced by the per-word
 *               comparison in the remainder of the function.
 */
function compareEmailToName($name, $email) {
    $name = strtolower($name);
    $email = strtolower($email);
    $parts = explode(" ", $name);
    $pcnt = 0;

    // An address without "@" has no domain part: use an empty domain instead
    // of reading a missing array element.
    $pieces = explode("@", $email);
    $user = $pieces[0];
    $dom = isset($pieces[1]) ? $pieces[1] : "";

    $user = str_replace(array(".", "+"), " ", $user);
    $dom = preg_replace("/\..*/", "", $dom);

    // similar_text() writes the similarity percentage into $pcnt.
    similar_text($name, $user, $pcnt);
    if($pcnt > 80) {
        return 1.0;
    }
    similar_text($name, $dom, $pcnt);
    if($pcnt > 80) {
        return 0.8;
    }

email.php

Page 36: Teaching Your Machine To Find Fraudsters

// Second half of compareEmailToName(): per-word comparison and fallback.
    if(count($parts)) {
        // Best similarity (in percent) between any single word of the name
        // and either the local part or the domain; matches of 50% or less
        // are ignored.
        $highest = 0;
        foreach($parts as $part) {
            similar_text($user, $part, $pcnt);
            if($pcnt > 50 && $pcnt > $highest) {
                // Record the percentage similar_text() just wrote to $pcnt.
                $highest = $pcnt;
            }
            similar_text($dom, $part, $pcnt);
            if($pcnt > 50 && $pcnt > $highest) {
                $highest = $pcnt;
            }
        }
        // Map 0..100% onto -1..0.7, so a partial match always scores below
        // the 0.8 and 1.0 returned for whole-name matches.
        return (1.7 * ($highest/100)) - 1;
    }

    return -1;
}

Page 37: Teaching Your Machine To Find Fraudsters
Page 38: Teaching Your Machine To Find Fraudsters
Page 39: Teaching Your Machine To Find Fraudsters
Page 40: Teaching Your Machine To Find Fraudsters

// Example transaction record: one value per feature, with the known
// 'fraud' label as the last field.
$data = [
    'purchase_value'             => 20993,
    'geo_country'                => 'DE',
    'previous_orders'            => 1,
    'time'                       => 6,
    'timegap'                    => 146632,
    'product_category'           => 'small_item',
    'delivery_matches_card'      => 0,
    'geo_ip_matches_card'        => 1,
    'difference_from_last_trans' => 8755,
    'free_shipping'              => 0,
    'email_like_name'            => 0,
    'free_email_provider'        => 0,
    'disposable_email_provider'  => 0,
    'quantity'                   => 2,
    'fraud'                      => 0,
];

Page 41: Teaching Your Machine To Find Fraudsters

SUPPORT VECTOR MACHINES

Page 42: Teaching Your Machine To Find Fraudsters

0

5

10

15

20

0 5 10 15 20

Page 43: Teaching Your Machine To Find Fraudsters

0

5

10

15

20

0 5 10 15 20

Page 44: Teaching Your Machine To Find Fraudsters

0

5

10

15

20

0 5 10 15 20

Page 45: Teaching Your Machine To Find Fraudsters

0

5

10

15

20

0 5 10 15 20

Page 46: Teaching Your Machine To Find Fraudsters

0

5

10

15

20

0 5 10 15 20

Page 47: Teaching Your Machine To Find Fraudsters

$ apt-get install libsvm-dev
$ apt-get install libsvm-tools

$ yum install libsvm-devel

$ pecl install svm-beta
$ echo extension=svm.so > /etc/php.d/svm.ini
$ php -r '$s = new svm(); $m = $s->train(array(array(-1, -1), array(1, 1))); echo $m->predict(array(0, -1));'
-1

Page 48: Teaching Your Machine To Find Fraudsters

// learn.php
// Turn every row of paydata.csv into one training vector: element 0 is the
// class label and the numbered elements are features scaled to about -1..1.
$fh = fopen('paydata.csv', 'r');
$output = array();

while ($data = fgetcsv($fh)) {
    $row = array();
    $row[0]  = ($data[14] == 1) ? -1 : 1;         // label: -1 when column 14 is 1, else +1
    $row[1]  = ($data[0] / 20000.00) - 1.0;       // price
    $row[2]  = ($data[1] == 'CN') ? 1.0 : -1.0;
    $row[3]  = ($data[1] == 'US') ? 1.0 : -1.0;
    $row[4]  = ($data[5] == 'digital') ? 1.0 : -1.0;
    $row[5]  = ($data[7] == 1) ? 1.0 : -1.0;      // geo
    $row[6]  = ($data[6] == 1) ? 1.0 : -1.0;      // deliv
    $row[12] = ($data[9] == 1) ? 1.0 : -1.0;      // ship
    $row[13] = ($data[13] / 1.5) - 1.0;           // qty
    $output[] = $row;
}

Page 49: Teaching Your Machine To Find Fraudsters

// Train on the prepared vectors (the second argument gives a weight per
// class label) and save the resulting model to learn.model.
$svm = new svm();
$model = $svm->train($output, array(-1 => 0.65, 1 => 0.5));
$model->save('learn.model');

// Re-classify the training rows and tally the confusion matrix.
// "Positive" is the +1 class in element 0 of each row.
$fp = $tp = $fn = $tn = 0;
foreach ($output as $test) {
    $res = $model->predict($test);
    $actualPositive = $test[0] > 0;
    $predictedPositive = $res > 0;

    if ($actualPositive && $predictedPositive) {
        $tp++;
    } elseif ($actualPositive) {
        $fn++;
    } elseif ($predictedPositive) {
        $fp++;
    } else {
        $tn++;
    }
}

Page 50: Teaching Your Machine To Find Fraudsters

// ...snip.. loading test data from
// paytest.csv

// Load the model saved by the training script.
$model = new SVMModel('learn.model');

// Classify every held-out row and tally the confusion matrix.
// "Positive" is the +1 class in element 0 of each row.
$fp = $tp = $fn = $tn = 0;
foreach ($output as $test) {
    $res = $model->predict($test);
    $actualPositive = $test[0] > 0;
    $predictedPositive = $res > 0;

    if ($actualPositive && $predictedPositive) {
        $tp++;
    } elseif ($actualPositive) {
        $fn++;
    } elseif ($predictedPositive) {
        $fp++;
    } else {
        $tn++;
    }
}

test.php

Page 51: Teaching Your Machine To Find Fraudsters

// Print the four confusion-matrix counts, then overall accuracy
// (correct predictions divided by all predictions).
var_dump("True Positive " . $tp);
var_dump("True Negative " . $tn);
var_dump("False Positive " . $fp);
var_dump("False Negative " . $fn);

$correct = $tp + $tn;
$total = $tp + $tn + $fp + $fn;
var_dump("Accuracy " . ($correct / $total));

Page 52: Teaching Your Machine To Find Fraudsters

$ php learn.php
string(18) "True Positive 8316"
string(18) "True Negative 1682"
string(16) "False Positive 2"
string(16) "False Negative 0"
string(15) "Accuracy 0.9998"

$ php test.php
string(17) "True Positive 844"
string(17) "True Negative 155"
string(16) "False Positive 0"
string(16) "False Negative 1"
string(14) "Accuracy 0.999"

Page 53: Teaching Your Machine To Find Fraudsters

Test Verify Update

Automated Manual Manual

training data

Page 54: Teaching Your Machine To Find Fraudsters

Time Series Class Based

Sensitivity Model

Days To Detect

False Alarms

False Positives

False Negatives

Page 55: Teaching Your Machine To Find Fraudsters

(shogun)

Page 57: Teaching Your Machine To Find Fraudsters

Title Slide - CSI http://www.flickr.com/photos/39matt/5241862082
Sickness Availability - Chicago Fire Department http://www.flickr.com/photos/mike_miley/3929146730/
Model Buildings - Ah Ain’t Long For This Whorl http://www.flickr.com/photos/chadmiller/98014022/
Repeat Customer - McDonald’s Loyalty Card http://www.flickr.com/photos/fsse-info/3658873057/
Shipping - FedEx Truck http://www.flickr.com/photos/moto_club4ag/4852235145/
Velocity - Chevrolet Chevelle Dragster http://www.flickr.com/photos/jns001/2958999006/
GeoIP - Earth Asia Terminator View http://www.flickr.com/photos/flyingsinger/86898564/
Multiple Items - Boxes http://www.flickr.com/photos/skrewtape/851672959/


Top Related