ingesting and manipulating data with javascript
TRANSCRIPT
Ingesting and Manipulating Data
with Javascript
Produces the world’s largest open source user conference dedicated to Lucene/Solr
Lucidworks is the primary sponsor of the Apache Solr project
Employs over 40% of the active committers on the Solr project
Contributes over 70% of Solr's open source codebase
40%
70%
Based in San Francisco
Offices in Bangalore, Bangkok, New York City, Raleigh, London
Over 300 customers across the Fortune 1000
Fusion, a Solr-powered platform for search-driven apps
An optimized search experience
for every user using relevance
boosting and machine learning.
Create custom search and
discovery applications in
minutes.
Highly scalable search
engine and NoSQL
datastore that gives you
instant access to all your
data.
Lucidworks Fusion product suite
• 50+ connectors
• Full SQL compatibility
• End-to-end security
• Multi-dimensional real-time
ingestion
• Administration and analytics
• Personalized
recommendations
• Machine learning out-of-the-
box
• Powerful recommenders
and classifiers
• Predictive search
• Point-and-click relevancy
tuning
• Quick prototyping
• Fine-grained security
• Stateless architecture
• Support 25+ data platforms
• Full library of components
• Pre-tested reusable
modules
Fusion Pipelines
Index Pipeline
Fusion Query Pipeline
Javascript Index Pipeline Stage
This is a
Fusion
Javascript
Pipeline stage
Why Javascript?
Javascript vs
Pipeline Stage
o Existential discussion at Lucidworks
o My opinion only…
Pipeline stages
are good for…
And…
Not…
o 20 discrete operations I have to do to convert one
field…
o Conditional operations (if this then this, otherwise
do this other thing)
o Canned functionality you have elsewhere.
o I don’t want to do anything that feels like
programming in form fields…
com.lucidworks.apollo.common.pipeline.PipelineDocument
PipelineDocument Highlights
https://doc.lucidworks.com/fusion-pipeline-javadocs/3.1/com/lucidworks/apollo/common/pipeline/PipelineDocument.html
PipelineDocument{
…
addField(name, value);
getAllFieldNames(); //include internal use names
getFieldNames(); //exclude internal use names
getFirstField(name);
getLastField(name);
removeFields(name);
setField(name, value);
...
}
The Javascript Function
Basic
// Minimal stage function: takes the document being processed and returns it.
// doc is presumably a PipelineDocument — confirm against the Fusion docs.
function (doc) {
// do really important things.
return doc;
}
With Context
// Stage function that additionally accepts a second argument, ctx, and
// returns the document.
// ctx is presumably the pipeline Context object — confirm against the
// Context javadoc.
function (doc, ctx) {
// do really important things.
return doc;
}
https://doc.lucidworks.com/fusion-pipeline-javadocs/3.1/com/lucidworks/apollo/pipeline/Context.html
With Collection
// Stage function that additionally accepts a third argument, collection,
// and returns the document.
// collection is presumably the name of the collection being indexed —
// TODO confirm.
function (doc, ctx, collection) {
// do really important things.
return doc;
}
With solrServer
// Stage function that additionally accepts a fourth argument, solrServer,
// and returns the document.
// solrServer is presumably a BufferingSolrServer — confirm against its javadoc.
function (doc, ctx, collection, solrServer) {
// do really important things.
// solrServer can index/query things
return doc;
}
https://doc.lucidworks.com/fusion-pipeline-javadocs/3.1/com/lucidworks/apollo/component/BufferingSolrServer.html
With
solrServerFactory
aka
SolrClientFactory
// Stage function that accepts all five arguments and returns the document.
// solrServerFactory is presumably a SolrClientFactory — confirm against its
// javadoc.
function (doc, ctx, collection, solrServer,
solrServerFactory) {
// do really important things.
// solrServerFactory can look up other collections
return doc;
}
https://doc.lucidworks.com/fusion-pipeline-javadocs/3.1/com/lucidworks/apollo/component/SolrClientFactory.html
Common Problems
Add a Field
function (doc) {
// replace any values currently in
the field with new ones
doc.setField('some-new-field',
'some field value');
// for multi value fields this will
combine values with old values if
there are any, otherwise it will add a
new field.
doc.addField('some-new-field',
'some field value');
return doc;
}
Glue Two
Fields
function(doc) {
var value = "";
if (doc.hasField("Actor1Geo_Lat") &&
doc.hasField("Actor1Geo_Long")) {
value =
doc.getFirstFieldValue("Actor1Geo_Lat") + "," +
doc.getFirstFieldValue("Actor1Geo_Long");
doc.addField("Actor1Geo_p", value);
}
return doc;
}
Iterate through the fields
function (doc) {
// list of doc fields to iterate over
var fields = doc.getFieldNames().toArray();
for (var i=0;i < fields.length;i++) {
var fieldName = fields[i];
var fieldValue = doc.getFirstFieldValue(fieldName);
logger.info("field name:" +fieldName + ", field name: " +
fieldValue);
}
}
return doc;
}
Logging
logger.info("field name:" + fieldName + ", field value: " + fieldValue);
fusion/3.1.x/var/log/connectors/connectors.log
Preview a field
function(doc){
if (doc.getId() != null) {
var fromField = "body_t";
var toField = "preview_t";
var value =
doc.getFirstFieldValue(fromField);
var pattern = /\n|\t/g;
value = value.replace(pattern, " ");
value = value ? value : "";
}
var length = value.length < 500 ?
value.length : 500;
value = value.substr(0,length);
doc.addField(toField, value);
}
return doc;
}
Bust up a
document
function (doc) {
var field = doc.getFieldValues('price');
var id = doc.getId();
var newDocs = [];
for (i = 0; i < field.size(); i++) {
newDocs.push( { 'id' : id+'-'+i,
'fields' : [ {'name' : 'subject', 'value' :
field.get(i) } ] } );
}
return newDocs;
}
Look up in another collection
/**
 * Looks up the document's sku in the "products" collection and records the
 * number of returned results in the document's "mentions" field.
 *
 * Documents that already have a "mentions" field are left as they are, and so
 * are all documents when the "products" Solr server cannot be obtained.
 *
 * @param doc               the document being processed
 * @param ctx               unused here
 * @param collection        unused here
 * @param solrServer        unused here
 * @param solrServerFactory used to obtain the Solr server for "products"
 * @returns the same document, so that it continues through the pipeline
 */
function doWork(doc, ctx, collection, solrServer, solrServerFactory) {
  if (doc.hasField("mentions")) {
    return doc;
  }

  var productsSolr = solrServerFactory.getSolrServer("products");
  if (productsSolr == null) {
    return doc;
  }

  // Java.type replaces the JavaImporter + with() idiom: `with` is rejected in
  // strict mode and hides where names come from.
  var SolrQuery = Java.type("org.apache.solr.client.solrj.SolrQuery");
  var ClientUtils = Java.type("org.apache.solr.client.solrj.util.ClientUtils");

  var sku = doc.getFirstFieldValue("sku");

  var query = new SolrQuery();
  query.setRows(100);
  // Escape the sku so characters such as spaces or ':' are matched literally
  // instead of being parsed as query syntax.
  query.setQuery("sku:" + ClientUtils.escapeQueryChars(String(sku)));

  var res = productsSolr.query(query);
  // NOTE(review): size() counts the returned rows, so it is capped at the
  // 100 requested above — confirm whether the total hit count is wanted.
  doc.addField("mentions", res.getResults().size());

  return doc;
}
Reject a
document
function (doc) {
if (doc.hasValue('foo')) {
return null; // stop this document from being indexed.
}
return doc;
}
Java +
Javascript
// Look up the Java class by its fully qualified name, then instantiate it
// with `new` like a JavaScript constructor.
var ArrayList = Java.type("java.util.ArrayList");
var a = new ArrayList();
Next Steps
o Grab Fusion https://lucidworks.com/download/
o Ingest some data
o Create a JavaScript pipeline stage and manipulate the data
o https://doc.lucidworks.com/fusion/latest/Indexing_Data/Custom-JavaScript-Indexing-Stages.html
o Attend a training
o Get support
Thank You