teaching your machine to find fraudsters
DESCRIPTION
The slides from my talk at PHP Tek 11. When dealing with money online, fraud is an ongoing problem for both consumers and sellers. Researchers have been developing statistical and machine learning techniques to detect shady sellers on auction sites, spot fraudulent payments on e-commerce systems and catch click fraud on adverts. While there is no silver bullet, you will learn to flag suspicious activity and help protect your site from scammers using PHP and a little help from some other technologies.
TRANSCRIPT
TEACHING YOUR MACHINE TO FIND FRAUDSTERS
Ian [email protected] twitter.com/ianbarber
https://github.com/ianbarber/FindingFraudsters-Talk
http://joind.in/3429
5%3%
.1%8%
SOME SMALL NUMBERS
99% ACCURACY
REALLY LEGITIMATE
REALLY FRAUD
EVALUATED LEGITIMATE
989 0
EVALUATED FRAUD
10 1
REALLY LEGITIMATE
REALLY FRAUD
EVALUATED LEGITIMATE
989 0
EVALUATED FRAUD
10 1
90% WRONG
ANOMALY DETECTION
0
7.5
15
22.5
30
Clicks
Date
Detector
User Clicks Ad
Alarm
No Alarm
Landing Page
SOFTWARE ARCHITECTURE
Buffer
Threshold
ExpectedClicks
Alarm
Sensitivity
Data Buffer
statistics
DETECTOR
// average.php

/**
 * Moving-average click-fraud detector.
 *
 * Reads fraudclicks.csv from the current directory one row at a time and
 * compares column 1 of each row against the mean of the previous seven
 * values plus $sen standard deviations. A row above that threshold raises
 * an alarm.
 *
 * Row 201 is a hard-coded cut-off: the first alarm after it ends the scan,
 * and every non-alarm row after it is counted in the second return value.
 * NOTE(review): presumably row 201 is where the known fraud starts in the
 * sample data — confirm against the CSV.
 *
 * @param float $sen Number of standard deviations above the mean that a
 *                   value must exceed to raise an alarm.
 * @return array array(alarm count minus one, non-alarm rows seen after row 201).
 *               The "minus one" presumably discounts the alarm that ends the
 *               scan — confirm with the caller.
 */
function detect($sen) {
    $window = array();
    $i = 0;
    $alarmCount = 0;
    $dtd = 0;
    $avg = $stddev = 0;
    $fraud = fopen("fraudclicks.csv", 'r');

    while ($d = fgetcsv($fraud)) {
        $i++;
        if (count($window) > 7) {
            // Drop the oldest value so the statistics cover exactly seven rows.
            array_shift($window);
            $avg = array_sum($window) / 7;

            // Start from zero each row; otherwise the previous row's deviation
            // leaks into this row's sum of squares.
            $stddev = 0;
            foreach ($window as $val) {
                $stddev += pow($val - $avg, 2);
            }
            $stddev = sqrt($stddev / 7);

            if ($d[1] > ($avg + ($sen * $stddev))) {
                $alarmCount++;
                if ($i > 201) {
                    break;
                }
            } else {
                if ($i > 201) {
                    $dtd++;
                }
            }
        }
        array_push($window, $d[1]);
    }
    fclose($fraud);

    return array($alarmCount - 1, $dtd);
}

// Slide text that was interleaved with this listing in the transcript
// (chart axis labels):
// 0
// 0.05
// 0.1
// 0.15
// 0.2
// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
0
7.5
15
22.5
30
Clicks
Date
18 False Alarms 1 Day To Detect
1.6 SENSITIVITY
0
7.5
15
22.5
30
Clicks
Date
1 False Alarm 18 Days To Detect
2.7 SENSITIVITY
SICKNESS AVAILABILITY
// Sickness/availability click-fraud detector. Reads fraudclicks.csv row by row;
// date("w", strtotime($dat[0])) turns column 0 into a day-of-week number. The
// estimate below only runs once $window holds at least 7 entries and $avail has
// an entry for that day of week.
// NOTE(review): this listing is incomplete as shown — $i is never incremented,
// $window and $avail are never filled, the while loop and the function are never
// closed, and fac() is not defined here. Presumably those parts were cut from the
// slides; confirm against the full sickavail.php before relying on this code.
function detect($sens) { $i = 0; $alarms = 0; $dtd = 0; $window = array(); $avail = array(); $fraud = fopen("fraudclicks.csv", 'r'); while($dat = fgetcsv($fraud)) { $dow = date("w", strtotime($dat[0])); if( count($window) >= 7 && isset($avail[$dow]) ) {
// $sick is the average, over every entry in $window, of value / mean($avail[key]).
// NOTE(review): $window's keys are used to index $avail, so $window is presumably
// keyed by day of week like $avail — confirm where $window is populated.
$sick = 0; foreach($window as $day => $value) { $dowavg = array_sum($avail[$day]) / count($avail[$day]); $sick += $value / $dowavg; } $sick /= count($window);
sickavail.php
// $est = $sick * mean($avail[$dow]) is the expected value for this row. $p is
// e^(-est) * est^k / fac(k) with k = $dat[1], i.e. the Poisson probability of k
// when fac() is the factorial (assumed; fac() is defined elsewhere — TODO confirm).
$avlblty = array_sum($avail[$dow]) / count($avail[$dow]); $est = $sick * $avlblty; $fac = fac($dat[1]); $p = exp(-$est) * pow($est,$dat[1]) / $fac; // poisson calc
// Alarm only when the observation is both improbable ($p < $sens) and above the
// estimate. Once $i exceeds 201 an alarm ends the loop, and each non-alarm row
// increments $dtd.
if($p < $sens && $dat[1] > $est) { $alarms++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } }
} // end if
0
0.05
0.1
0.15
0.2
1 2 3 4 5 6 7 8 9 10
0
7.5
15
22.5
30
Clicks
Date
1 False Alarm 1 Day To Detect
0.011 SENSITIVITY
SUPERVISED CLASSIFIERS
Classifier
User Purchase
Fraud
Not Fraud
Transaction Processor
Transaction Database Learner
classification model
SOFTWARE ARCHITECTURE
Learner Training Data Model
Model
Test Data
Classifier Prediction Accuracy
EVALUATING THE CLASSIFIER
0
5
10
15
20
0 5 10 15 20
?
0
5
10
15
20
0 5 10 15 20
?
0
5
10
15
20
0 5 10 15 20
// Labelled example transactions used as the training set.
$docs = array(
    array('fraud' => false, 'price' => 1699,  'desc' => 'toy ninja', 'ship' => 'US'),
    array('fraud' => false, 'price' => 20000, 'desc' => 'TV',        'ship' => 'US'),
    array('fraud' => false, 'price' => 2500,  'desc' => 'cds',       'ship' => 'US'),
    array('fraud' => true,  'price' => 20000, 'desc' => 'console',   'ship' => 'CN'),
    array('fraud' => true,  'price' => 5000,  'desc' => 'books',     'ship' => 'US'),
    array('fraud' => true,  'price' => 15000, 'desc' => 'ipod',      'ship' => 'CN'),
);

// Open (or create) the Xapian database "index" and set up an English-stemming
// term generator.
$db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);
$idx = new XapianTermGenerator();
$stem = new XapianStem("english");
$idx->set_stemmer($stem);

// One Xapian document per transaction: the class label ("fraud"/"clean") is
// stored as the document data, and price, description and shipping country
// are indexed together as one space-separated text.
foreach ($docs as $key => $doc) {
    $xdoc = new XapianDocument();
    $xdoc->set_data($doc['fraud'] ? "fraud" : "clean");
    $idx->set_document($xdoc);
    $idx->index_text(implode(' ', array($doc['price'], $doc['desc'], $doc['ship'])));
    $db->add_document($xdoc, $key);
}

// Drop the reference to the database (presumably so the handle is released
// and pending writes are flushed — confirm against the Xapian bindings).
$db = null;
fraudknn.php
// The unlabelled transaction to classify.
$test = array(
    'price' => 10000,
    'desc'  => 'TV',
    'ship'  => 'CN',
);

// Open the same "index" database and stemming term generator that the
// training script uses.
$db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);
$idx = new XapianTermGenerator();
$stem = new XapianStem("english");
$idx->set_stemmer($stem);

// Index the test transaction in the same way as the training documents and
// keep the id returned by add_document() so it can be referenced later.
$xdoc = new XapianDocument();
$idx->set_document($xdoc);
$idx->index_text(implode(' ', array($test['price'], $test['desc'], $test['ship'])));
$id = $db->add_document($xdoc);
testknn.php
// Build a relevance set holding only the test document and ask Xapian for
// expansion terms from it (10 is passed as the first argument to get_eset()).
$enq = new XapianEnquire($db);
$rset = new XapianRSet();
$rset->add_document($id);
$eset = $enq->get_eset(10, $rset);

// Collect the expansion terms into a plain array.
$terms = array();
for ($i = $eset->begin(); !$i->equals($eset->end()); $i->next()) {
    $terms[] = $i->get_term();
}

// OR the terms together and fetch matches with get_mset(0, 4, $rset).
$q = new XapianQuery(XapianQuery::OP_OR, $terms);
$enq->set_query($q);
$matches = $enq->get_mset(0, 4, $rset);

// Print the stored class label of every match except the test document itself.
for ($i = $matches->begin(); !$i->equals($matches->end()); $i->next()) {
    if ($i->get_document()->get_docid() == $id) {
        continue;
    }
    $class = $i->get_document()->get_data();
    var_dump($class);
}

// Remove the temporary test document from the index again.
$db->delete_document($id);
$ php testknn.php
string(5) "clean"
string(5) "fraud"
string(5) "fraud"
TRANSACTION PARAMETERS
// email.php

/**
 * Score how closely an email address resembles a customer's name.
 *
 * Both inputs are lower-cased. The local part of the address has "." and "+"
 * replaced by spaces; the domain is cut at its first ".". Similarity is
 * measured with similar_text() percentages.
 *
 * @param string $name  Customer name; split on spaces into parts.
 * @param string $email Email address.
 * @return float 1.0 when the whole name is more than 80% similar to the local
 *               part; 0.8 when it is more than 80% similar to the domain;
 *               otherwise 1.7 * (best/100) - 1, where "best" is the highest
 *               percentage above 50 between any name part and the local part
 *               or domain, or 0 if there is none (result -1.0).
 */
function compareEmailToName($name, $email) {
    $name = strtolower($name);
    $email = strtolower($email);
    $parts = explode(" ", $name);
    $pcnt = 0;

    // array_pad() gives an address without "@" an empty domain instead of an
    // undefined-offset notice from list().
    list($user, $dom) = array_pad(explode("@", $email), 2, "");
    $user = str_replace(array(".", "+"), " ", $user);
    $dom = preg_replace("/\..*/", "", $dom);

    similar_text($name, $user, $pcnt);
    if ($pcnt > 80) {
        return 1.0;
    }

    similar_text($name, $dom, $pcnt);
    if ($pcnt > 80) {
        return 0.8;
    }

    // explode() with a non-empty separator always returns at least one element,
    // so no emptiness guard on $parts is needed here.
    $highest = 0;
    foreach ($parts as $part) {
        similar_text($user, $part, $pcnt);
        if ($pcnt > 50 && $pcnt > $highest) {
            $highest = $pcnt;
        }
        similar_text($dom, $part, $pcnt);
        if ($pcnt > 50 && $pcnt > $highest) {
            $highest = $pcnt;
        }
    }

    return (1.7 * ($highest / 100)) - 1;
}
// Example of a single transaction record with its parameters.
$data = array(
    'purchase_value'             => 20993,
    'geo_country'                => 'DE',
    'previous_orders'            => 1,
    'time'                       => 6,
    'timegap'                    => 146632,
    'product_category'           => 'small_item',
    'delivery_matches_card'      => 0,
    'geo_ip_matches_card'        => 1,
    'difference_from_last_trans' => 8755,
    'free_shipping'              => 0,
    'email_like_name'            => 0,
    'free_email_provider'        => 0,
    'disposable_email_provider'  => 0,
    'quantity'                   => 2,
    'fraud'                      => 0
);
SUPPORT VECTOR MACHINES
0
5
10
15
20
0 5 10 15 20
0
5
10
15
20
0 5 10 15 20
0
5
10
15
20
0 5 10 15 20
0
5
10
15
20
0 5 10 15 20
0
5
10
15
20
0 5 10 15 20
$ apt-get install libsvm-dev
$ apt-get install libsvm-tools
$ yum install libsvm-devel
$ pecl install svm-beta
$ echo extension=svm.so > /etc/php.d/svm.ini
$ php -r '$s = new svm(); $m = $s->train(array(array(-1, -1), array(1, 1))); echo $m->predict(array(0, -1));'
-1
// learn.php
//
// Trains an SVM on paydata.csv, saves the model to learn.model and then
// counts how the model classifies the rows it was trained on.

$fh = fopen('paydata.csv', 'r');
$output = array();

// Each CSV row becomes array(label, featureIndex => value, ...). Features are
// scaled or mapped to the range -1.0 .. 1.0.
while ($data = fgetcsv($fh)) {
    $output[] = array(
        // Label: -1 when column 14 is 1, otherwise +1.
        // NOTE(review): column 14 is presumably the fraud flag — confirm.
        $data[14] == 1 ? -1 : 1,
        1 => ($data[0] / 20000.00) - 1.0, // price
        2 => $data[1] == 'CN' ? 1.0 : -1.0,
        3 => $data[1] == 'US' ? 1.0 : -1.0,
        4 => $data[5] == 'digital' ? 1.0 : -1.0,
        5 => $data[7] == 1 ? 1.0 : -1.0, // geo
        6 => $data[6] == 1 ? 1.0 : -1.0, // deliv
        12 => $data[9] == 1 ? 1.0 : -1.0, // ship
        13 => ($data[13] / 1.5) - 1.0, // qty
    );
}
fclose($fh);

// The second argument to train() gives a weight per class label.
$svm = new svm();
$model = $svm->train($output, array(-1 => 0.65, 1 => 0.5));
$model->save('learn.model');

// Confusion matrix over the training rows. "Positive" means label/prediction
// greater than 0.
$fp = $tp = $fn = $tn = 0;
foreach ($output as $test) {
    $res = $model->predict($test);
    if ($test[0] > 0) {
        if ($res > 0) {
            $tp++;
        } else {
            $fn++;
        }
    } else {
        if ($res > 0) {
            $fp++;
        } else {
            $tn++;
        }
    }
}
// ...snip.. loading test data from
// paytest.csv

// Load the model from learn.model.
$model = new SVMModel('learn.model');

// Confusion matrix over $output: a row counts as positive when its label
// ($test[0]) is above zero, and as predicted positive when predict() returns
// a value above zero.
$fp = $tp = $fn = $tn = 0;
foreach ($output as $test) {
    $res = $model->predict($test);
    if ($test[0] > 0 && $res > 0) {
        $tp++;
    } elseif ($test[0] > 0) {
        $fn++;
    } elseif ($res > 0) {
        $fp++;
    } else {
        $tn++;
    }
}
test.php
// Print the four confusion-matrix counters, then the accuracy: correct
// predictions ($tp + $tn) divided by all predictions.
var_dump("True Positive " . $tp);
var_dump("True Negative " . $tn);
var_dump("False Positive " . $fp);
var_dump("False Negative " . $fn);
var_dump(
    "Accuracy " . (($tp + $tn) / ($tp + $tn + $fp + $fn))
);
$ php learn.php
string(18) "True Positive 8316"
string(18) "True Negative 1682"
string(16) "False Positive 2"
string(16) "False Negative 0"
string(15) "Accuracy 0.9998"
$ php test.php
string(17) "True Positive 844"
string(17) "True Negative 155"
string(16) "False Positive 0"
string(16) "False Negative 1"
string(14) "Accuracy 0.999"
Test Verify Update
Automated Manual Manual
training data
Time Series Class Based
Sensitivity Model
Days To Detect
False Alarms
False Positives
False Negatives
(shogun)
TEACHING YOUR MACHINE TO FIND FRAUDSTERS
http://joind.in/3429
Title Slide - CSI http://www.flickr.com/photos/39matt/5241862082
Sickness Availability - Chicago Fire Department http://www.flickr.com/photos/mike_miley/3929146730/
Model Buildings - Ah Ain’t Long For This Whorl http://www.flickr.com/photos/chadmiller/98014022/
Repeat Customer - McDonald’s Loyalty Card http://www.flickr.com/photos/fsse-info/3658873057/
Shipping - FedEx Truck http://www.flickr.com/photos/moto_club4ag/4852235145/
Velocity - Chevrolet Chevelle Dragster http://www.flickr.com/photos/jns001/2958999006/
GeoIP - Earth Asia Terminator View http://www.flickr.com/photos/flyingsinger/86898564/
Multiple Items - Boxes http://www.flickr.com/photos/skrewtape/851672959/