Skip to content

Commit aa4679e

Browse files
committed
[DCS] Adding GA version of document conversion
This commit includes the GA version of the api, tests, and examples for the Document Conversion service (v1)
1 parent d3ba0fa commit aa4679e

File tree

5 files changed

+411
-2
lines changed

5 files changed

+411
-2
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ var fs = require('fs');
296296
var document_conversion = watson.document_conversion({
297297
username: '<username>',
298298
password: '<password>',
299-
version: 'v1-experimental',
299+
version: 'v1',
300300
version_date: '2015-12-01'
301301
});
302302

@@ -682,5 +682,5 @@ See [CONTRIBUTING](https://github.com/watson-developer-cloud/node-sdk/blob/maste
682682
[npm_link]: https://www.npmjs.com/package/watson-developer-cloud
683683
[request_github]: https://github.com/request/request
684684
[examples]: https://github.com/watson-developer-cloud/node-sdk/tree/master/examples
685-
[document_conversion_integration_example]: https://github.com/watson-developer-cloud/node-sdk/tree/master/examples/document_conversion_integration.v1-experimental.js
685+
[document_conversion_integration_example]: https://github.com/watson-developer-cloud/node-sdk/tree/master/examples/document_conversion_integration.v1.js
686686
[license]: http://www.apache.org/licenses/LICENSE-2.0

examples/document_conversion.v1.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
'use strict';

var watson = require('watson-developer-cloud');
var fs = require('fs');

// Client for the Document Conversion service (v1); replace the placeholders
// with the credentials of your own service instance.
var document_conversion = watson.document_conversion({
  username: 'INSERT YOUR USERNAME FOR THE SERVICE HERE',
  password: 'INSERT YOUR PASSWORD FOR THE SERVICE HERE',
  version: 'v1'
});

/**
 * Prints the outcome of the conversion: the error on failure, otherwise the
 * response serialized as indented JSON.
 */
function printResult(err, response) {
  if (err) {
    console.error(err);
    return;
  }
  console.log(JSON.stringify(response, null, 2));
}

// Request describing the conversion of a single document.
var conversionRequest = {
  file: fs.createReadStream(__dirname + '/resources/example.html'),
  // one of ANSWER_UNITS, NORMALIZED_HTML, or NORMALIZED_TEXT
  conversion_target: document_conversion.conversion_target.ANSWER_UNITS,
  config: {
    // split the html file by "h2", "h3" and "h4" tags
    html_to_answer_units: {
      selectors: ['h2', 'h3', 'h4']
    }
  }
};

document_conversion.convert(conversionRequest, printResult);
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
'use strict';
2+
3+
/*
4+
Document Conversion integration with Retrieve and Rank
5+
6+
The Document Conversion integration example shows how to convert a document into Answer Units by using the
7+
Document Conversion Service and upload it to the Retrieve and Rank Service to make the Answer Units searchable.
8+
9+
1. Create a solr cluster, upload the solr configuration and create a collection
10+
1.1 In the files retrieve_and_rank_lifecycle.v1.js and retrieve_and_rank_solr.v1.js you will find example functions
11+
on how to perform these steps.
12+
1.2 IMPORTANT: When uploading the solr configuration, use the [answer_unit_config.zip] from the resources folder,
13+
which includes a schema.xml that defines the fields that will be indexed.
14+
2. Edit the file document_conversion_integration.v1.js and enter the following:
15+
2.1 service credentials for the Document Conversion and the Retrieve and Rank services (each service instance has a
16+
different set of credentials)
17+
2.2 clusterId (obtained when creating the cluster)
18+
2.3 collectionName and inputDocument if you are using a different value from the default
19+
3. Run the following command:
20+
node document_conversion_integration.v1.js
21+
*/
22+
23+
var watson = require('watson-developer-cloud');
24+
var async = require('async');
25+
var fs = require('fs');
26+
27+
/*
28+
Insert the credentials for your Retrieve and Rank service instance
29+
NOTE: you cannot use your Bluemix account credentials here
30+
*/
31+
var retrieve = watson.retrieve_and_rank({
32+
username: 'INSERT YOUR USERNAME FOR THE SERVICE HERE',
33+
password: 'INSERT YOUR PASSWORD FOR THE SERVICE HERE',
34+
version: 'v1',
35+
url: 'https://gateway.watsonplatform.net/retrieve-and-rank/api'
36+
});
37+
38+
/*
39+
Insert the credentials for your Document Conversion service instance
40+
NOTE: you cannot use your Bluemix account credentials here
41+
*/
42+
var document_conversion = watson.document_conversion({
43+
username: 'INSERT YOUR USERNAME FOR THE SERVICE HERE',
44+
password: 'INSERT YOUR PASSWORD FOR THE SERVICE HERE',
45+
version: 'v1'
46+
});
47+
48+
var clusterId = 'INSERT YOUR CLUSTER ID HERE';
49+
50+
var inputDocument = '/resources/watson-wikipedia.html';
51+
var collectionName = 'example_collection';
52+
53+
var solrClient = retrieve.createSolrClient({
54+
cluster_id: clusterId,
55+
collection_name: collectionName
56+
});
57+
58+
// Pipeline: convert the input document into Answer Units, index them in the
// Solr collection, then run a sample search. Every step must call `done`
// exactly once so that the waterfall can advance or stop.
async.waterfall([

  function convert(done) {
    // convert a single document
    document_conversion.convert({
      file: fs.createReadStream(__dirname + inputDocument),
      // one of ANSWER_UNITS, NORMALIZED_HTML, or NORMALIZED_TEXT
      conversion_target: document_conversion.conversion_target.ANSWER_UNITS,
      config: {
        html_to_html: {
          specify_content_to_extract: {
            enabled: true,
            xpaths: ['//h3']
          }
        }
      }
    }, function(err, response) {
      if (err) {
        // Without a converted document there is nothing to index or search:
        // hand the error to the waterfall so it stops and reports it.
        return done(err);
      }
      done(null, response);
    });
  },

  function indexAndCommit(response, done) {
    console.log('Indexing a document...');
    var doc = mapAnswerUnits2SolrDocs(response);
    solrClient.add(doc, function(err) {
      if (err) {
        // best effort: log and continue with the search step
        console.log('Error indexing document: ' + err);
        return done();
      }
      console.log('Indexed a document.');
      solrClient.commit(function(err) {
        if (err) {
          console.log('Error committing change: ' + err);
        } else {
          console.log('Successfully committed changes.');
        }
        done();
      });
    });
  },

  function _search(done) {
    console.log('Searching all documents.');
    var query = solrClient.createQuery();
    // This query searches for the term 'psychological' in the content_text field.
    // For a wildcard query use:
    // query.q({ '*' : '*' });
    query.q({
      'content_text': 'psychological'
    });

    solrClient.search(query, function(err, searchResponse) {
      if (err) {
        console.log('Error searching for documents: ' + err);
      } else {
        console.log('Found ' + searchResponse.response.numFound + ' document(s).');
        console.log('First document: ' + JSON.stringify(searchResponse.response.docs[0], null, 2));
      }
      done();
    });
  }
], function(err) {
  // Only the convert step forwards an error; the later steps log their own.
  if (err) {
    console.error(err);
  }
});
125+
126+
/**
 * Builds the list of Solr documents for a Document Conversion response.
 *
 * Answer units that have no 'text/plain' content produce no Solr document and
 * are skipped, so the returned list never contains undefined entries.
 *
 * @param {Object} data conversion response; its answer_units array is read
 * @returns {Array} one Solr document per answer unit that has plain text
 *                  (empty when data or data.answer_units is missing)
 */
function mapAnswerUnits2SolrDocs(data) {
  var answerUnits = (data && data.answer_units) || [];
  var solrDocList = [];
  answerUnits.forEach(function(answerUnit) {
    var solrDoc = convertAnswerUnit2SolrDoc(answerUnit);
    if (solrDoc) {
      solrDocList.push(solrDoc);
    }
  });
  return solrDocList;
}

/**
 * Converts a single answer unit into a Solr document built from its
 * 'text/plain' content entry.
 *
 * @param {Object} au answer unit; id, title, type and content are read
 * @returns {Object|undefined} the Solr document, or undefined when the answer
 *                             unit has no 'text/plain' content
 */
function convertAnswerUnit2SolrDoc(au) {
  var solrDoc;
  var auContents = au.content || [];
  // If several 'text/plain' entries exist, the last one wins.
  auContents.forEach(function(auContent) {
    if (auContent.media_type === 'text/plain') {
      solrDoc = {
        id: au.id,
        title: au.title,
        type: au.type,
        media_type: auContent.media_type,
        content_text: auContent.text
      };
    }
  });
  return solrDoc;
}

services/document_conversion/v1.js

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
/**
2+
* Copyright 2014 IBM Corp. All Rights Reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
'use strict';
18+
19+
var extend = require('extend');
20+
var requestFactory = require('../../lib/requestwrapper');
21+
var isStream = require('isstream');
22+
var omit = require('object.omit');
23+
24+
/**
 * Document Conversion (v1) service client.
 *
 * @param {Object} [options] user-supplied options; they take precedence over
 *                           the service defaults (url and qs) set here
 * @param {String} [options.version_date] stored as qs.version in the default
 *                           options; when omitted a warning is printed and
 *                           '2015-12-01' is used
 */
function DocumentConversion(options) {
  var hasVersionDate = Boolean(options && options.version_date);

  // Warn if not specifying version date
  if (!hasVersionDate) {
    console.warn('[DocumentConversion] WARNING: No version_date specified. Using a (possibly old) default. ' +
      'e.g. watson.document_conversion({ version_date: "2015-12-01" })');
  }

  // Service defaults, replaced by any user-provided option of the same name
  this._options = extend({
    url: 'https://gateway.watsonplatform.net/document-conversion/api',
    qs: { version: hasVersionDate ? options.version_date : '2015-12-01' }
  }, options);
}
43+
44+
45+
// Supported conversion targets, exposed as an enum-like map whose keys equal
// their values (e.g. conversion_target.ANSWER_UNITS === 'ANSWER_UNITS').
DocumentConversion.prototype.conversion_target = [
  'ANSWER_UNITS',
  'NORMALIZED_HTML',
  'NORMALIZED_TEXT'
].reduce(function(targets, name) {
  targets[name] = name;
  return targets;
}, {});
50+
51+
/**
 * Tags HTML uploads with an explicit content type.
 *
 * When params.file has a path ending in ".htm" or ".html", params.file is
 * replaced in place by a { value, options } wrapper that carries the original
 * file and a 'text/html; charset=utf-8' content type. Any other input is left
 * untouched.
 *
 * @param {Object} params request parameters; params.file is read and may be replaced
 */
function fixupContentType(params) {
  // The dot is escaped so only a real ".htm"/".html" extension matches
  // (an unescaped dot would also accept e.g. "nothtml").
  if (params.file && params.file.path && /\.html?$/.test(params.file.path)) {
    params.file = {
      value: params.file,
      options: {
        contentType: 'text/html; charset=utf-8'
      }
    };
  }
}
61+
62+
/**
 * One-off convert an attached document OR convert a previously uploaded document by ID
 *
 * To convert a previously uploaded document, set params.document_id
 *
 * When params.file is given the request is sent as form data: the file goes in
 * the `file` part and every other param is serialized as JSON into the
 * `config` part. Otherwise the params are sent as the request body.
 *
 * @param {Object} params
 * @param {String} params.conversion_target Must be set to one of ['ANSWER_UNITS', 'NORMALIZED_HTML', 'NORMALIZED_TEXT']
 * @param {ReadableStream} [params.file] The document file to convert.
 * @param {String} [params.document_id] ID of a previously uploaded document (required when params.file is not set)
 * @param {Function} callback called with an Error when validation fails; otherwise handed to the request wrapper
 * @returns {*} the value returned by the request wrapper, or undefined when validation fails
 */
DocumentConversion.prototype.convert = function(params, callback) {
  // Work on a shallow copy: fixupContentType() replaces params.file and the
  // caller's object must not be altered.
  params = extend({}, params);

  var targets = DocumentConversion.prototype.conversion_target;
  // Own-property check, so inherited names such as 'constructor' are rejected.
  var isKnownTarget = Boolean(params.conversion_target) &&
    Object.prototype.hasOwnProperty.call(targets, params.conversion_target);

  if (!isKnownTarget) {
    var values = Object.keys(targets).map(function(key) { return targets[key]; });

    callback(new Error('Missing required parameters: conversion_target. Possible values are: ' + values.join(', ')));
    return;
  }

  if (!params.file && !params.document_id) {
    callback(new Error('Missing required parameters: either params.file or params.document_id must be specified'));
    return;
  }

  // A { value, options } wrapper is accepted in place of a bare stream.
  if (params.file && !isStream(params.file) && !params.file.value) {
    callback(new Error('Missing required parameters: file is not a standard Node.js Stream'));
    return;
  }

  var parameters = {
    options: {
      method: 'POST',
      url: '/v1/convert_document',
      json: true
    },
    defaultOptions: this._options
  };

  // send the parameters in the body or as formData depending on the request
  if (params.file) {
    fixupContentType(params);
    parameters.options.formData = {
      file: params.file,
      config: {
        value: JSON.stringify(omit(params, ['file'])),
        options: {
          contentType: 'application/json; charset=utf-8'
        }
      }
    };
  } else {
    parameters.options.body = params;
  }

  return requestFactory(parameters, callback);
};
117+
118+
119+
120+
// give a clear error message for the deprecated methods:
// each name below becomes a prototype method that throws when called
[
  'getOutput',
  'getOutputs',
  'getJobLog',
  'getJobs',
  'getJob',
  'createJob',
  'getBatchDocument',
  'getBatchDocuments',
  'addDocumentToBatch',
  'getDocument',
  'getDocuments',
  'uploadDocument',
  'updateBatch',
  'getBatch',
  'createBatch',
  'getBatches'
].forEach(function(name) {
  DocumentConversion.prototype[name] = function deprecated() {
    throw new Error('The DocumentConversion.' + name + '() method was deprecated and is no longer available, please use convert() instead.');
  };
});
127+
128+
129+
module.exports = DocumentConversion;

0 commit comments

Comments
 (0)