|
| 1 | +'use strict'; |
| 2 | + |
| 3 | +/* |
| 4 | +Document Conversion integration with Retrieve and Rank |
| 5 | +
|
| 6 | +The Document Conversion integration example shows how to convert a document into Answer Units by using the |
| 7 | +Document Conversion Service and upload it to the Retrieve and Rank Service to make the Answer Units searchable. |
| 8 | +
|
| 9 | + 1. Create a solr cluster, upload the solr configuration and create a collection |
| 10 | + 1.1 In the files retrieve_and_rank_lifecycle.v1.js and retrieve_and_rank_solr.v1.js you will find example functions |
| 11 | + on how to perform these steps. |
| 12 | + 1.2 IMPORTANT: When uploading the solr configuration, use the [answer_unit_config.zip] from the resources folder, |
| 13 | + which includes a schema.xml that defines the fields that will be indexed. |
| 14 | + 2. Edit the file document_conversion_integration.v1.js and enter the following: |
| 15 | + 2.1 service credentials for the Document Conversion and the Retrieve and Rank services (each service instance has a |
| 16 | + different set of credentials) |
| 17 | + 2.2 clusterId (obtained when creating the cluster) |
| 18 | + 2.3 collectionName and inputDocument if you are using a different value from the default |
| 19 | + 3. Run the following command: |
| 20 | + node document_conversion_integration.v1.js |
| 21 | +*/ |
| 22 | + |
| 23 | +var watson = require('watson-developer-cloud'); |
| 24 | +var async = require('async'); |
| 25 | +var fs = require('fs'); |
| 26 | + |
/*
Target Solr cluster and collection, plus the document to convert.
clusterId is the value returned when the cluster was created.
inputDocument is appended to __dirname when the file is opened, so it
must start with a path separator.
*/
var clusterId = 'INSERT YOUR CLUSTER ID HERE';
var collectionName = 'example_collection';
var inputDocument = '/resources/watson-wikipedia.html';

/*
Credentials for your Retrieve and Rank service instance.
NOTE: these are the service credentials, not your Bluemix account credentials.
*/
var retrieveOptions = {
  username: 'INSERT YOUR USERNAME FOR THE SERVICE HERE',
  password: 'INSERT YOUR PASSWORD FOR THE SERVICE HERE',
  version: 'v1',
  url: 'https://gateway.watsonplatform.net/retrieve-and-rank/api'
};
var retrieve = watson.retrieve_and_rank(retrieveOptions);

/*
Credentials for your Document Conversion service instance.
NOTE: these are the service credentials, not your Bluemix account credentials.
*/
var documentConversionOptions = {
  username: 'INSERT YOUR USERNAME FOR THE SERVICE HERE',
  password: 'INSERT YOUR PASSWORD FOR THE SERVICE HERE',
  version: 'v1'
};
var document_conversion = watson.document_conversion(documentConversionOptions);

/* Solr client bound to the cluster and collection configured above. */
var solrClientOptions = {
  cluster_id: clusterId,
  collection_name: collectionName
};
var solrClient = retrieve.createSolrClient(solrClientOptions);
| 57 | + |
/*
Pipeline: convert the input document into Answer Units, index them in the
Solr collection and commit, then run a sample search.

Every step forwards its error to `done`, so async.waterfall skips the
remaining steps as soon as one fails; the final callback then marks the
process as failed. Each step logs its own error before forwarding it.
*/
async.waterfall([

  function convert(done) {
    // convert a single document
    document_conversion.convert({
      file: fs.createReadStream(__dirname + inputDocument),
      // (JSON) ANSWER_UNITS, NORMALIZED_HTML, or NORMALIZED_TEXT
      conversion_target: document_conversion.conversion_target.ANSWER_UNITS,
      config: {
        html_to_html: {
          specify_content_to_extract: {
            enabled: true,
            xpaths: ['//h3']
          }
        }
      }
    }, function(err, response) {
      if (err) {
        console.error(err);
        // Forward the error so the pipeline terminates instead of stalling.
        return done(err);
      }
      done(null, response);
    });
  },

  function indexAndCommit(response, done) {
    console.log('Indexing a document...');
    var doc = mapAnswerUnits2SolrDocs(response);
    solrClient.add(doc, function(err) {
      if (err) {
        console.log('Error indexing document: ' + err);
        // Do not search an index that failed to update.
        return done(err);
      }
      console.log('Indexed a document.');
      solrClient.commit(function(err) {
        if (err) {
          console.log('Error committing change: ' + err);
          return done(err);
        }
        console.log('Successfully committed changes.');
        done();
      });
    });
  },

  function _search(done) {
    console.log('Searching all documents.');
    var query = solrClient.createQuery();
    // This query searches for the term 'psychological' in the content_text field.
    // For a wildcard query use:
    // query.q({ '*' : '*' });
    query.q({
      'content_text': 'psychological'
    });

    solrClient.search(query, function(err, searchResponse) {
      if (err) {
        console.log('Error searching for documents: ' + err);
        return done(err);
      }
      var docs = searchResponse.response.docs;
      console.log('Found ' + searchResponse.response.numFound + ' document(s).');
      // Only print a first document when the result set actually contains one.
      if (docs && docs.length > 0) {
        console.log('First document: ' + JSON.stringify(docs[0], null, 2));
      }
      done();
    });
  }
], function(err) {
  // The failing step has already logged the details; just signal failure
  // to the caller of the script through the exit code.
  if (err) {
    process.exitCode = 1;
  }
});
| 125 | + |
/**
 * Maps a Document Conversion ANSWER_UNITS response to a list of Solr documents.
 *
 * Each entry of `data.answer_units` is passed to convertAnswerUnit2SolrDoc.
 * Answer units for which that function yields no document are skipped, so
 * the returned list never contains undefined entries.
 *
 * @param {Object} data - conversion response; its `answer_units` array is read.
 * @returns {Object[]} the Solr documents, in answer-unit order; an empty array
 *   when `data` or `data.answer_units` is missing.
 */
function mapAnswerUnits2SolrDocs(data) {
  var answerUnits = (data && Array.isArray(data.answer_units)) ? data.answer_units : [];
  return answerUnits
    .map(function(answerUnit) {
      return convertAnswerUnit2SolrDoc(answerUnit);
    })
    .filter(function(solrDoc) {
      // Drop answer units that produced no document.
      return solrDoc !== undefined && solrDoc !== null;
    });
}
| 135 | + |
/**
 * Builds a Solr document from a single answer unit.
 *
 * The document is built from the answer unit's `text/plain` content entry.
 * When several entries have that media type, the last one wins.
 *
 * @param {Object} au - answer unit; `id`, `title`, `type` and `content` are read.
 * @returns {Object|undefined} a document with `id`, `title`, `type`,
 *   `media_type` and `content_text`, or undefined when the answer unit has no
 *   `content` array or no `text/plain` entry.
 */
function convertAnswerUnit2SolrDoc(au) {
  // Tolerate answer units without a content array instead of throwing.
  var contents = (au && Array.isArray(au.content)) ? au.content : [];
  var solrDoc;
  contents.forEach(function(auContent) {
    if (auContent && auContent.media_type === 'text/plain') {
      solrDoc = {
        id: au.id,
        title: au.title,
        type: au.type,
        media_type: auContent.media_type,
        content_text: auContent.text
      };
    }
  });
  return solrDoc;
}
0 commit comments