[DCS] Adding GA version of document conversion

samir-patel · samir-patel · commit aa4679e53ed6 · 2015-12-14T16:05:00.000-05:00
This commit includes the GA version of the api, tests,
and examples for the Document Conversion service (v1)
diff --git a/README.md b/README.md
@@ -296,7 +296,7 @@ var fs = require('fs');
 var document_conversion = watson.document_conversion({
   username:     '<username>',
   password:     '<password>',
-  version:      'v1-experimental',
+  version:      'v1',
   version_date: '2015-12-01'
 });
 
@@ -682,5 +682,5 @@ See [CONTRIBUTING](https://github.com/watson-developer-cloud/node-sdk/blob/maste
 [npm_link]: https://www.npmjs.com/package/watson-developer-cloud
 [request_github]: https://github.com/request/request
 [examples]: https://github.com/watson-developer-cloud/node-sdk/tree/master/examples
-[document_conversion_integration_example]: https://github.com/watson-developer-cloud/node-sdk/tree/master/examples/document_conversion_integration.v1-experimental.js
+[document_conversion_integration_example]: https://github.com/watson-developer-cloud/node-sdk/tree/master/examples/document_conversion_integration.v1.js
 [license]: http://www.apache.org/licenses/LICENSE-2.0
diff --git a/examples/document_conversion.v1.js b/examples/document_conversion.v1.js
@@ -0,0 +1,29 @@
+'use strict';
+
+var watson = require('watson-developer-cloud');
+var fs = require('fs');
+
+var document_conversion = watson.document_conversion({
+  username: 'INSERT YOUR USERNAME FOR THE SERVICE HERE',
+  password: 'INSERT YOUR PASSWORD FOR THE SERVICE HERE',
+  version: 'v1',
+  version_date: '2015-12-01' // pin the API version date; omitting it makes the SDK print a warning
+});
+
+// convert a single document; conversion_target is one of (JSON) ANSWER_UNITS, NORMALIZED_HTML, or NORMALIZED_TEXT
+document_conversion.convert({
+  file: fs.createReadStream(__dirname + '/resources/example.html'),
+  conversion_target: document_conversion.conversion_target.ANSWER_UNITS,
+  config: {
+    // split the html file by "h2", "h3" and "h4" tags
+    html_to_answer_units: {
+      selectors: [ 'h2','h3', 'h4']
+    }
+  }
+}, function (err, response) {
+  if (err) {
+    console.error(err);
+  } else {
+    console.log(JSON.stringify(response, null, 2));
+  }
+});
diff --git a/examples/document_conversion_integration.v1.js b/examples/document_conversion_integration.v1.js
@@ -0,0 +1,151 @@
+'use strict';
+
+/*
+Document Conversion integration with Retrieve and Rank
+
+The Document Conversion integration example shows how to convert a document into Answer Units by using the
+Document Conversion Service and upload it to the Retrieve and Rank Service to make the Answer Units searchable.
+
+ 1. Create a solr cluster, upload the solr configuration and create a collection
+    1.1 In the files retrieve_and_rank_lifecycle.v1.js and retrieve_and_rank_solr.v1.js you will find example functions
+        on how to perform these steps.
+    1.2 IMPORTANT: When uploading the solr configuration, use the [answer_unit_config.zip] from the resources folder,
+    which includes a schema.xml that defines the fields that will be indexed.
+ 2. Edit the file document_conversion_integration.v1.js and enter the following:
+    2.1 service credentials for the Document Conversion and the Retrieve and Rank services (each service instance has a
+        different set of credentials)
+    2.2 clusterId (obtained when creating the cluster)
+    2.3 collectionName and inputDocument if you are using a different value from the default
+ 3. Run the following command:
+      node document_conversion_integration.v1.js
+*/
+
+var watson = require('watson-developer-cloud');
+var async = require('async');
+var fs = require('fs');
+
+/*
+Insert the credentials for your Retrieve and Rank service instance
+NOTE: you cannot use your Bluemix account credentials here
+*/
+var retrieve = watson.retrieve_and_rank({
+  username: 'INSERT YOUR USERNAME FOR THE SERVICE HERE',
+  password: 'INSERT YOUR PASSWORD FOR THE SERVICE HERE',
+  version: 'v1',
+  url: 'https://gateway.watsonplatform.net/retrieve-and-rank/api'
+});
+
+/*
+Insert the credentials for your Document Conversion service instance
+NOTE: you cannot use your Bluemix account credentials here
+*/
+var document_conversion = watson.document_conversion({
+  username: 'INSERT YOUR USERNAME FOR THE SERVICE HERE',
+  password: 'INSERT YOUR PASSWORD FOR THE SERVICE HERE',
+  version: 'v1',
+  version_date: '2015-12-01' // pin the API version date; omitting it makes the SDK print a warning
+});
+
+var clusterId = 'INSERT YOUR CLUSTER ID HERE';
+var inputDocument = '/resources/watson-wikipedia.html';
+var collectionName = 'example_collection';
+
+var solrClient = retrieve.createSolrClient({
+  cluster_id: clusterId,
+  collection_name: collectionName
+});
+
+async.waterfall([
+
+  function convert(done) {
+    // convert a single document to (JSON) ANSWER_UNITS, NORMALIZED_HTML, or NORMALIZED_TEXT
+    document_conversion.convert({
+      file: fs.createReadStream(__dirname + inputDocument),
+      conversion_target: document_conversion.conversion_target.ANSWER_UNITS,
+      config: {
+        html_to_html: {
+          specify_content_to_extract: {
+            enabled: true,
+            xpaths: ['//h3']
+          }
+        }
+      }
+    }, function(err, response) {
+      if (err) {
+        console.error(err);
+      }
+      done(err, response); // always finish the step; an error skips the remaining steps
+    });
+  },
+
+  function indexAndCommit(response, done) {
+    console.log('Indexing a document...');
+    var doc = mapAnswerUnits2SolrDocs(response);
+    solrClient.add(doc, function(err) {
+      if (err) {
+        console.log('Error indexing document: ' + err);
+        done();
+      } else {
+        console.log('Indexed a document.');
+        solrClient.commit(function(err) {
+          if (err) {
+            console.log('Error committing change: ' + err);
+          } else {
+            console.log('Successfully committed changes.');
+          }
+          done();
+        });
+      }
+    });
+  },
+
+  function _search(done) {
+    console.log('Searching all documents.');
+    var query = solrClient.createQuery();
+    // This query searches for the term 'psychological' in the content_text field.
+    // For a wildcard query use:
+    // query.q({ '*' : '*' });
+    query.q({
+      'content_text': 'psychological'
+    });
+
+    solrClient.search(query, function(err, searchResponse) {
+      if (err) {
+        console.log('Error searching for documents: ' + err);
+      } else {
+        console.log('Found ' + searchResponse.response.numFound + ' document(s).');
+        console.log('First document: ' + JSON.stringify(searchResponse.response.docs[0], null, 2));
+      }
+      done();
+    });
+  }
+], function(err) {
+  if (err) { process.exitCode = 1; } // details were logged by the failing step; exit non-zero
+});
+
+// Maps the answer units of a conversion response to Solr documents,
+// skipping answer units that have no 'text/plain' content.
+function mapAnswerUnits2SolrDocs(data) {
+  var answerUnits = (data && data.answer_units) || [];
+  return answerUnits.map(function(au) {
+    return convertAnswerUnit2SolrDoc(au);
+  }).filter(Boolean);
+}
+
+// Builds a Solr document from the answer unit's 'text/plain' content entry
+// (the last one wins); returns undefined when the unit has none.
+function convertAnswerUnit2SolrDoc(au) {
+  var solrDoc;
+  (au.content || []).forEach(function(auContent) {
+    if (auContent.media_type === 'text/plain') {
+      solrDoc = {
+        id: au.id,
+        title: au.title,
+        type: au.type,
+        media_type: auContent.media_type,
+        content_text: auContent.text
+      };
+    }
+  });
+  return solrDoc;
+}
diff --git a/services/document_conversion/v1.js b/services/document_conversion/v1.js
@@ -0,0 +1,129 @@
+/**
+ * Copyright 2014 IBM Corp. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+var extend         = require('extend');
+var requestFactory = require('../../lib/requestwrapper');
+var isStream       = require('isstream');
+var omit           = require('object.omit');
+
+// Service wrapper constructor: stores the user options merged over the default url and version query string.
+function DocumentConversion(options) {
+  var hasVersionDate = Boolean(options && options.version_date);
+
+  // Nudge callers to pin a version date instead of relying on the built-in one
+  if (!hasVersionDate) {
+    console.warn('[DocumentConversion] WARNING: No version_date specified. Using a (possibly old) default. ' +
+                  'e.g. watson.document_conversion({ version_date: "2015-12-01" })');
+  }
+
+  // Service defaults; the version query parameter carries the chosen version date
+  var defaults = {
+    url: 'https://gateway.watsonplatform.net/document-conversion/api',
+    qs: { version: hasVersionDate ? options.version_date : '2015-12-01' }
+  };
+
+  // User-provided options replace the defaults
+  this._options = extend(defaults, options);
+}
+
+
+DocumentConversion.prototype.conversion_target = { // the values convert() accepts for params.conversion_target
+  ANSWER_UNITS: 'ANSWER_UNITS',
+  NORMALIZED_HTML: 'NORMALIZED_HTML',
+  NORMALIZED_TEXT: 'NORMALIZED_TEXT'
+};
+
+// Wraps params.file (in place) with an explicit HTML content type when its path ends in .htm or .html
+function fixupContentType(params) {
+  // the dot is escaped so only a real extension matches (e.g. not a path ending in "xhtml")
+  if (params.file && params.file.path && /\.html?$/.test(params.file.path)) {
+    params.file = {
+      value: params.file,
+      options: { contentType: 'text/html; charset=utf-8' }
+    };
+  }
+}
+
+/**
+ * Converts a document: validates params, then POSTs to /v1/convert_document. With params.file the document is sent
+ * as multipart form data; otherwise params.document_id is required and params is sent as the JSON body.
+ *
+ * @param  {String} params.conversion_target Must be set to one of ['ANSWER_UNITS', 'NORMALIZED_HTML', 'NORMALIZED_TEXT']
+ * @param  {ReadableStream} [params.file] The document file to convert.
+ * @param  {Function} callback Called with an Error when validation fails; otherwise passed to the request wrapper
+ */
+DocumentConversion.prototype.convert = function(params, callback) {
+  params = params || {};
+  var targets = DocumentConversion.prototype.conversion_target;
+  // own-property check so inherited names such as 'toString' are rejected as targets
+  if (!params.conversion_target || !Object.prototype.hasOwnProperty.call(targets, params.conversion_target)) {
+    var values = Object.keys(targets).map(function(v) { return targets[v]; });
+    callback(new Error('Missing required parameters: conversion_target. Possible values are: ' + values.join(', ')));
+    return;
+  }
+
+  if (!params.file && !params.document_id) {
+    callback(new Error('Missing required parameters: either params.file or params.document_id must be specified'));
+    return;
+  }
+
+  if (params.file && !isStream(params.file) && !params.file.value) { // a { value, options } wrapper is accepted too
+    callback(new Error('Missing required parameters: file is not a standard Node.js Stream'));
+    return;
+  }
+
+  var parameters = {
+    options: {
+      method: 'POST',
+      url: '/v1/convert_document',
+      json: true
+    },
+    defaultOptions: this._options
+  };
+
+  // send the parameters in the body or as formData depending on the request
+  if (params.file) {
+    fixupContentType(params);
+    parameters.options.formData = {
+      file: params.file,
+      config: {
+        value: JSON.stringify(omit(params,['file'])),
+        options: {
+          contentType: 'application/json; charset=utf-8'
+        }
+      }
+    };
+  } else {
+    parameters.options.body = params;
+  }
+
+  return requestFactory(parameters, callback);
+};
+
+
+
+// these methods are no longer available; every stub throws an Error that points callers to convert()
+['getOutput', 'getOutputs', 'getJobLog', 'getJobs', 'getJob', 'createJob', 'getBatchDocument', 'getBatchDocuments',
+  'addDocumentToBatch', 'getDocument', 'getDocuments', 'uploadDocument', 'updateBatch', 'getBatch', 'createBatch',
+  'getBatches'].forEach(function(methodName) {
+  var message = 'The DocumentConversion.' + methodName + '() method was deprecated and is no longer available, please use convert() instead.';
+  DocumentConversion.prototype[methodName] = function deprecated() { throw new Error(message); };
+});
+
+
+module.exports = DocumentConversion;
diff --git a/test/test.document_conversion.v1.js b/test/test.document_conversion.v1.js