Skip to content

Commit 0977782

Browse files
added yelp cron
1 parent b196466 commit 0977782

File tree

16 files changed

+313
-56
lines changed

16 files changed

+313
-56
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# package directories
2+
node_modules
3+
jspm_packages
4+
5+
# Serverless directories
6+
.serverless
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"use strict";
2+
const {
3+
getPage,
4+
parsePage,
5+
saveRatingsToDB,
6+
deployScrapers
7+
} = require("./utils");
8+
9+
/**
 * Lambda handler: scrapes one Yelp business page and stores its rating data.
 *
 * Pipeline: getPage(event) -> parsePage(page) -> saveRatingsToDB(yelpData, event).
 *
 * @param {*} event - Passed unchanged to getPage and saveRatingsToDB and
 *   interpolated into the response/error messages. Presumably the Yelp
 *   business slug string — confirm against the invoker.
 * @param {Object} context - Lambda context object (unused).
 * @param {Function} callback - Node-style callback. Called with
 *   (null, { statusCode, body }) on success, or with an Error on failure.
 */
module.exports.scrape = (event, context, callback) => {
  // JSON.stringify(new Error("x")) yields "{}" because `message` and `stack`
  // are not enumerable, so Error instances are reported via their message.
  const describeError = error => {
    if (error instanceof Error) {
      return error.message;
    }
    if (typeof error === "string") {
      return error;
    }
    return JSON.stringify(error);
  };

  // 1. fetch yelp page
  getPage(event)
    // 2. parse the page
    .then(page => parsePage(page))
    // 3. save ratings data to our db
    .then(yelpData => saveRatingsToDB(yelpData, event))
    .then(() =>
      callback(null, {
        statusCode: 200,
        body: JSON.stringify({
          message: `Scraped ${event}`
        })
      })
    )
    .catch(error =>
      callback(new Error(`Error scraping ${event}: ${describeError(error)}`))
    );
};
28+
29+
/**
 * Lambda handler that hands every business in a hard-coded list to
 * deployScrapers, one call per business, in list order.
 *
 * The list is a stand-in for a real database query. `event`, `context` and
 * `callback` are accepted but not used; `callback` is never invoked here.
 */
module.exports.launch_scrapers = (event, context, callback) => {
  // Placeholder data until the businesses are read from the database.
  const placeholderBusinesses = [
    "urban-light-at-lacma-los-angeles",
    "the-museum-of-contemporary-art-los-angeles",
    "the-last-bookstore-los-angeles"
  ];

  for (const slug of placeholderBusinesses) {
    deployScrapers(slug);
  }
};
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
const request = require("request-promise");
2+
const AWS = require("aws-sdk");
3+
4+
const list = [
5+
"urban-light-at-lacma-los-angeles",
6+
"the-museum-of-contemporary-art-los-angeles",
7+
"the-last-bookstore-los-angeles"
8+
];
9+
10+
/**
 * Invokes the "yelp-scraper-dev-scrape" Lambda function in us-west-2 for one
 * business, sending the JSON-encoded business name as the payload.
 *
 * The invocation outcome is only logged: errors go to console.error and
 * response data to console.log.
 *
 * @param {*} businessName - Value serialised with JSON.stringify as the payload.
 * @returns {*} Whatever `lambda.invoke(params, callback)` returns.
 */
function deployScraper(businessName) {
  const client = new AWS.Lambda({ region: "us-west-2" });

  const invocation = {
    FunctionName: "yelp-scraper-dev-scrape",
    InvocationType: "RequestResponse",
    LogType: "Tail",
    Payload: JSON.stringify(businessName)
  };

  // NOTE(review): the values returned from this callback are presumably
  // ignored by the SDK, so they never reach deployScraper's caller — confirm.
  const onInvoked = (error, data) => {
    if (error) {
      console.error(JSON.stringify(error));
      return new Error(`Error scraping: ${JSON.stringify(error)}`);
    }
    if (data) {
      console.log(data);
      return JSON.stringify(data);
    }
    return undefined;
  };

  return client.invoke(invocation, onInvoked);
}
32+
33+
/**
 * Calls deployScraper once for every entry of `arr`, in order.
 *
 * @param {Array} arr - Business names to hand to deployScraper.
 * @returns {undefined} The results of deployScraper are not collected.
 */
function swarm(arr) {
  const launchOne = businessName => {
    deployScraper(businessName);
  };

  arr.forEach(launchOne);
}
38+
39+
swarm(list);
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"name": "cron-launcher",
3+
"version": "1.0.0",
4+
"description": "",
5+
"main": "handler.js",
6+
"scripts": {
7+
"test": "echo \"Error: no test specified\" && exit 1"
8+
},
9+
"keywords": [],
10+
"author": "",
11+
"license": "ISC",
12+
"dependencies": {
13+
"aws-sdk": "^2.92.0",
14+
"cheerio": "^1.0.0-rc.2",
15+
"request": "^2.81.0",
16+
"request-promise": "^4.2.1",
17+
"uuid": "^3.1.0"
18+
},
19+
"devDependencies": {
20+
"serverless-offline-scheduler": "^0.3.3"
21+
}
22+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
service: yelp-scraper
2+
3+
plugins:
4+
- serverless-offline-scheduler
5+
6+
provider:
7+
name: aws
8+
runtime: nodejs6.10
9+
region: us-west-2
10+
stage: dev
11+
environment:
12+
DYNAMODB_TABLE: yelp-ratings
13+
iamRoleStatements:
14+
- Effect: "Allow"
15+
Action:
16+
- "lambda:InvokeFunction"
17+
Resource: "*"
18+
- Effect: Allow
19+
Action:
20+
- dynamodb:Query
21+
- dynamodb:Scan
22+
- dynamodb:GetItem
23+
- dynamodb:PutItem
24+
- dynamodb:UpdateItem
25+
- dynamodb:DeleteItem
26+
Resource: "arn:aws:dynamodb:${opt:region, self:provider.region}:*:table/${self:provider.environment.DYNAMODB_TABLE}"
27+
package:
28+
include:
29+
- utils/**
30+
31+
32+
33+
functions:
34+
scrape:
35+
handler: handler.scrape
36+
37+
launch_scrapers:
38+
handler: handler.launch_scrapers
39+
events:
40+
- schedule: rate(1 minute)

2_yelp_scraper_code/20_cron_launcher/utils/deployScrapers.js renamed to 2_yelp_scraper_code/20_ZZZZZcron_launcher/utils/deployScrapers.js

File renamed without changes.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
const request = require("request-promise");
2+
3+
/**
 * Issues a GET request for a business's Yelp page.
 *
 * @param {*} businessName - Interpolated verbatim after "/biz/" in the URL.
 * @returns {*} The result of the request-promise call for that URL.
 */
module.exports = businessName =>
  request({
    method: "GET",
    // e.g. https://www.yelp.com/biz/the-last-bookstore-los-angeles
    url: `https://www.yelp.com/biz/${businessName}`
  });
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
module.exports = {
2+
getPage: require("./getPage"),
3+
parsePage: require("./parsePage"),
4+
saveRatingsToDB: require("./saveRatingsToDB"),
5+
deployScrapers: require("./deployScrapers")
6+
};
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
const cheerio = require("cheerio");
2+
3+
module.exports = page => {
4+
try {
5+
const $ = cheerio.load(page);
6+
const rating = $(".rating-info .i-stars")
7+
.attr("title")
8+
.trim()
9+
.split(" ")[0];
10+
const reviewCount = $(".rating-info .review-count")
11+
.text()
12+
.trim()
13+
.split(" ")[0];
14+
15+
const yelpData = {
16+
rating,
17+
reviewCount
18+
};
19+
20+
return Promise.resolve(yelpData);
21+
} catch (error) {
22+
return Promise.reject(`Error parsing page: ${JSON.stringify(error)}`);
23+
}
24+
};
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
const uuid = require("uuid");
2+
const AWS = require("aws-sdk");
3+
4+
const dynamoDb = new AWS.DynamoDB.DocumentClient();
5+
6+
module.exports = (yelpData, businessName) => {
7+
const date = JSON.stringify(new Date());
8+
const params = {
9+
TableName: process.env.DYNAMODB_TABLE,
10+
Item: {
11+
id: uuid.v1(),
12+
businessName: businessName,
13+
reviewCount: yelpData.reviewCount,
14+
rating: yelpData.rating,
15+
scrapedAt: date
16+
}
17+
};
18+
19+
dynamoDb.put(params, error => {
20+
if (error) {
21+
console.error(`Error saving data to DynamoDB: ${JSON.stringify(error)}`);
22+
return Promise.reject(
23+
`Error saving data to DynamoDB: ${JSON.stringify(error)}`
24+
);
25+
} else {
26+
return Promise.resolve(params.Item);
27+
}
28+
});
29+
};

0 commit comments

Comments
 (0)