1
+ import stream , { Transform , Writable } from 'stream' ;
2
+ import fs from 'fs' ;
3
+ import { promisify } from 'util' ;
1
4
import PromiseQueue from 'p-queue' ;
2
5
import logger from './logger.js' ;
3
6
import defaults from './config/defaults.js' ;
4
7
import recursiveSources from './config/recursive-sources.js' ;
5
8
import Resource from './resource.js' ;
6
9
import request from './request.js' ;
7
10
import ResourceHandler from './resource-handler/index.js' ;
11
+
8
12
import {
9
13
SaveResourceToFileSystemPlugin ,
10
14
GenerateFilenameBySiteStructurePlugin ,
@@ -36,6 +40,44 @@ const filenameGeneratorPlugins = {
36
40
bySiteStructure : GenerateFilenameBySiteStructurePlugin
37
41
} ;
38
42
43
/**
 * Transform stream that sits between the HTTP response and the file writer.
 *
 * Every chunk is decoded as UTF-8 text and upper-cased before being passed on.
 * NOTE(review): the upper-casing looks like placeholder behaviour for a future
 * `afterResponse` hook - confirm before relying on it; it will also mangle
 * binary resources.
 *
 * Decoding uses a streaming TextDecoder so that a multi-byte character split
 * across two chunks is reassembled instead of being replaced by U+FFFD, which
 * is what decoding each chunk independently with `chunk.toString()` produced.
 */
class AfterResponseTransformStream extends Transform {
	/**
	 * @param {object} [options] - forwarded unchanged to the Transform constructor
	 */
	constructor (options) {
		super(options);
		// ignoreBOM keeps a leading U+FEFF in the output, as Buffer#toString() does.
		this.textDecoder = new TextDecoder('utf-8', { ignoreBOM: true });
	}

	/**
	 * Decode one chunk and emit its upper-cased text.
	 *
	 * @param {Buffer|string} chunk - incoming data
	 * @param {string} encoding - unused; Buffers are always decoded as UTF-8
	 * @param {function(Error|null, string=)} callback - receives the transformed text
	 */
	_transform (chunk, encoding, callback) {
		// Only Buffers need stateful decoding; anything else keeps the old toString() path.
		const text = Buffer.isBuffer(chunk)
			? this.textDecoder.decode(chunk, { stream: true })
			: chunk.toString();

		// The decoder yields '' while it is holding back an incomplete character.
		callback(null, text.length > 0 ? text.toUpperCase() : undefined);
	}

	/**
	 * Emit whatever the decoder is still holding once the input has ended.
	 *
	 * @param {function(Error|null, string=)} callback - receives the remaining text, if any
	 */
	_flush (callback) {
		const tail = this.textDecoder.decode();
		callback(null, tail.length > 0 ? tail.toUpperCase() : undefined);
	}
}
53
+
54
/**
 * Transform stream associated with a single resource.
 *
 * NOTE(review): judging by the name this is meant to process the resource's
 * children while its body streams through; no such handling exists yet, so
 * chunks are forwarded unchanged.
 */
class HandleChildrenResourcesStream extends Transform {
	/**
	 * @param {object} resource - the resource whose body flows through this stream
	 */
	constructor (resource) {
		super();
		this.resource = resource;
	}

	/**
	 * Forward the chunk untouched.
	 *
	 * The callback must always be invoked: a Transform that never calls it
	 * stops consuming input and stalls every pipeline it is part of.
	 *
	 * @param {Buffer|string} chunk - incoming data
	 * @param {string} encoding - encoding of `chunk` when it is a string
	 * @param {function(Error|null, (Buffer|string)=)} callback - receives the chunk
	 */
	_transform (chunk, encoding, callback) {
		callback(null, chunk);
	}
}
64
+
65
/**
 * Writable stream that saves everything written to it into
 * `<dir>/<resource.getFilename()>`.
 *
 * The target file is opened lazily on the first write, so the filename is
 * only read from the resource once data actually arrives. A stream that
 * never receives data does not create a file.
 */
class SaveResourceWriteStream extends Writable {
	/**
	 * @param {object} resource - must expose `getFilename()`
	 * @param {string} dir - directory the file is created in
	 */
	constructor (resource, dir) {
		super();
		this.stream = null;
		this.resource = resource;
		this.directory = dir;
	}

	/**
	 * Write one chunk to the file, opening it first if necessary.
	 *
	 * @param {Buffer|string} chunk - data to write
	 * @param {string} encoding - encoding of `chunk` when it is a string
	 * @param {function(Error=)} callback - invoked once the chunk has been handled
	 */
	_write (chunk, encoding, callback) {
		if (!this.stream) {
			this.stream = fs.createWriteStream(this.directory + '/' + this.resource.getFilename());
			// Without a listener an open/write failure would be thrown as an
			// unhandled 'error' event; surface it through this stream instead.
			this.stream.on('error', (err) => this.destroy(err));
		}
		// Use the public write(): calling the file stream's _write() directly
		// bypasses its open-state handling and internal buffering.
		this.stream.write(chunk, encoding, callback);
	}

	/**
	 * Finish the underlying file before this stream reports 'finish'.
	 *
	 * @param {function(Error=)} callback - invoked when the file stream has finished
	 */
	_final (callback) {
		if (!this.stream) {
			callback();
			return;
		}
		this.stream.end(callback);
	}

	/**
	 * Release the file descriptor when this stream is torn down.
	 *
	 * @param {Error|null} err - error that caused the destruction, if any
	 * @param {function(Error|null)} callback - receives `err` unchanged
	 */
	_destroy (err, callback) {
		if (this.stream) {
			this.stream.destroy();
		}
		callback(err);
	}
}
80
+
39
81
class Scraper {
40
82
constructor ( options ) {
41
83
this . normalizeOptions ( options ) ;
@@ -49,6 +91,7 @@ class Scraper {
49
91
} ) ;
50
92
this . resources = this . options . urls . map ( ( { url, filename} ) => new Resource ( url , filename ) ) ;
51
93
94
+ this . requestedResourcesStream = new NormalizedUrlMap ( ) ; // Map url -> stream
52
95
this . requestedResourcePromises = new NormalizedUrlMap ( ) ; // Map url -> request promise
53
96
this . loadedResources = new NormalizedUrlMap ( ) ; // Map url -> resource
54
97
this . requestQueue = new PromiseQueue ( { concurrency : this . options . requestConcurrency } ) ;
@@ -137,10 +180,60 @@ class Scraper {
137
180
}
138
181
}
139
182
140
- createNewRequest ( resource ) {
183
+ async getRequestStream ( resource ) {
184
+ const url = resource . getUrl ( ) ;
185
+ const referer = resource . parent ? resource . parent . getUrl ( ) : null ;
186
+
187
+ const { requestOptions} = await this . runActions ( 'beforeRequest' , { resource, requestOptions : this . options . request } ) ;
188
+ return request . get ( {
189
+ url,
190
+ referer,
191
+ options : requestOptions ,
192
+ //afterResponse: this.actions.afterResponse.length ? this.runActions.bind(this, 'afterResponse') : undefined
193
+ } ) ;
194
+ }
195
+
196
+ getSaveStream ( resource ) {
197
+
198
+ }
199
+
200
+ async createNewRequest ( resource ) {
141
201
const self = this ;
142
202
const url = resource . getUrl ( ) ;
143
203
204
+ // read stream
205
+ const requestStream = await this . getRequestStream ( resource ) ;
206
+ requestStream . on ( 'response' , async ( response ) => {
207
+ const mimeType = request . getMimeType ( response . headers ) ;
208
+ resource . setType ( getTypeByMime ( mimeType ) ) ;
209
+
210
+ const { filename } = await self . runActions ( 'generateFilename' , { resource } ) ;
211
+ resource . setFilename ( filename ) ;
212
+
213
+ // if type was not determined by mime we can try to get it from filename after it was generated
214
+ if ( ! resource . getType ( ) ) {
215
+ resource . setType ( getTypeByFilename ( filename ) ) ;
216
+ }
217
+ } ) ;
218
+
219
+ // transformers
220
+ const afterResponseTransformStream = new AfterResponseTransformStream ( ) ;
221
+
222
+ // write stream
223
+ fs . mkdirSync ( this . options . directory ) ;
224
+ const saveResourceStream = new SaveResourceWriteStream ( resource , this . options . directory ) ;
225
+
226
+ const pipeline = promisify ( stream . pipeline ) ;
227
+
228
+ await pipeline (
229
+ requestStream ,
230
+ afterResponseTransformStream ,
231
+ // handleChildResourcesStream,
232
+ saveResourceStream
233
+ ) ;
234
+
235
+ //return this.requestQueue.add(requestStream);
236
+
144
237
const requestPromise = Promise . resolve ( )
145
238
. then ( async ( ) => {
146
239
const referer = resource . parent ? resource . parent . getUrl ( ) : null ;
0 commit comments