Skip to content

Commit 476af01

Browse files
committed
fix selectors, add docker and pip packages
1 parent f823795 commit 476af01

File tree

5 files changed

+213
-0
lines changed

5 files changed

+213
-0
lines changed

console.es

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
2+
// GET search?q=name:salads/_doc/

// Phrase search on the recipe title.
// The previous body used "match" with a field literally named "phrase";
// recipe documents have no such field, so it could never return a hit.
POST recipes/_search
{
  "query": {
    "match_phrase": {
      "title": {
        "query": "Salad"
      }
    }
  }
}

// No request body: the search matches every document in the index.
POST recipes/_search

fetch_recipes_fix.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import json
2+
import logging
3+
from pprint import pprint
4+
from time import sleep
5+
6+
import requests
7+
from bs4 import BeautifulSoup
8+
from elasticsearch import Elasticsearch
9+
10+
11+
def search(es_object, index_name, search):
    """Run a search against an index, pretty-print the response and return it.

    Args:
        es_object: Client exposing ``search(index=..., body=...)``.
        index_name: Name of the index to query.
        search: Query body, passed through unchanged as ``body``.

    Returns:
        Whatever ``es_object.search`` returned, so callers can inspect the
        hits instead of only seeing them on stdout.
    """
    res = es_object.search(index=index_name, body=search)
    pprint(res)
    return res
14+
15+
16+
def create_index(es_object, index_name):
    """Create ``index_name`` with the recipe mapping if it does not exist yet.

    Args:
        es_object: Client exposing ``indices.exists`` and ``indices.create``.
        index_name: Name of the index to create.

    Returns:
        True only when this call created the index. False when the index
        already existed, the server did not acknowledge the request, or the
        client raised an error (the error is printed, not re-raised).
    """
    # NOTE(review): the mapping is nested under a "salads" type name.
    # Elasticsearch 7+ rejects typed mappings by default -- confirm against
    # the server version in use.
    settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "salads": {
                "dynamic": "strict",
                "properties": {
                    "title": {
                        "type": "text"
                    },
                    "submitter": {
                        "type": "text"
                    },
                    "description": {
                        "type": "text"
                    },
                    "calories": {
                        "type": "integer"
                    },
                    "ingredients": {
                        "type": "nested",
                        "properties": {
                            "step": {"type": "text"}
                        }
                    },
                }
            }
        }
    }

    try:
        if es_object.indices.exists(index=index_name):
            return False
        # ignore=400 makes the client return the error body instead of
        # raising, so the response has to be inspected before claiming success.
        response = es_object.indices.create(index=index_name, ignore=400, body=settings)
    except Exception as ex:
        # Best effort: report the problem and let the caller carry on.
        print(str(ex))
        return False

    if response and response.get('acknowledged'):
        print('Created Index')
        return True

    print('Index was not created: {}'.format(response))
    return False
61+
62+
63+
def store_record(elastic_object, index_name, record):
    """Index one record and report whether the client accepted it.

    Args:
        elastic_object: Client exposing ``index(index=..., doc_type=..., body=...)``.
        index_name: Name of the target index.
        record: Document body, passed through unchanged as ``body``.

    Returns:
        True when the ``index`` call returned without raising, False when it
        raised an ``Exception`` (the error is printed, not re-raised).
    """
    try:
        outcome = elastic_object.index(index=index_name, doc_type='salads', body=record)
    except Exception as ex:
        print('Error in indexing data')
        print(str(ex))
        return False

    # Returning here rather than from a ``finally`` clause lets
    # KeyboardInterrupt / SystemExit propagate instead of being swallowed.
    print(outcome)
    return True
74+
75+
76+
def connect_elasticsearch(host='localhost', port=9200):
    """Build an Elasticsearch client and verify that the server answers a ping.

    Args:
        host: Host name of the Elasticsearch node.
        port: HTTP port of the Elasticsearch node.

    Returns:
        The client when ``ping()`` succeeds, otherwise None, so that the
        callers' ``is not None`` checks actually reflect connectivity.
    """
    client = Elasticsearch([{'host': host, 'port': port}])
    if client.ping():
        print('Yay Connected')
        return client

    print('Awww it could not connect!')
    return None
84+
85+
86+
def _extract_calories(texts):
    """Return the first calorie count found in ``texts`` as an int, else 0."""
    import re  # local import so the block does not depend on a file-level one

    for text in texts:
        match = re.search(r'(\d[\d,]*)\s*cal', text, re.IGNORECASE)
        if match:
            return int(match.group(1).replace(',', ''))
    return 0


def parse(u):
    """Fetch one recipe page and return its fields as a JSON string.

    Reads the module-level ``headers`` dict for the HTTP request headers.

    Args:
        u: URL of the recipe page.

    Returns:
        A JSON object string with the keys ``title``, ``submitter``,
        ``description``, ``calories`` (int) and ``ingredients`` (a list of
        ``{"step": text}`` objects). Returns ``"{}"`` when the page does not
        answer with HTTP 200 or when fetching/parsing raises.
    """
    rec = {}

    try:
        # A timeout keeps one stalled page from hanging the whole crawl.
        r = requests.get(u, headers=headers, timeout=10)

        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'lxml')

            title_section = soup.select('h1.headline.heading-content')
            submitter_section = soup.select('a.author-name.link')
            description_section = soup.select('p.margin-0-auto')
            ingredients_section = soup.select('span.ingredients-item-name')
            calories_section = soup.select('div.section-body')

            title = title_section[0].text.strip() if title_section else '-'
            submit_by = submitter_section[0].text.strip() if submitter_section else '-'
            if description_section:
                description = description_section[0].text.strip().replace('"', '')
            else:
                description = '-'

            ingredients = []
            for ingredient in ingredients_section:
                ingredient_text = ingredient.text.strip()
                if ingredient_text and 'Add all ingredients to list' not in ingredient_text:
                    ingredients.append({'step': ingredient_text})

            # The selector is broad, so scan every match for "<number> cal..."
            # and store an integer rather than the raw section text.
            calories = _extract_calories(section.text for section in calories_section)

            rec = {'title': title, 'submitter': submit_by, 'description': description,
                   'calories': calories, 'ingredients': ingredients}
    except Exception as ex:
        # Best effort: one bad page is reported and yields an empty record.
        print('Exception while parsing')
        print(str(ex))

    return json.dumps(rec)
150+
151+
152+
if __name__ == '__main__':
    # Crawl the salad listing page, index every recipe found, then run a
    # sample range query against the index.
    #
    # NOTE: ``headers`` must remain a module-level name; parse() reads it as
    # a global.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        'Pragma': 'no-cache'
    }
    logging.basicConfig(level=logging.ERROR)

    url = 'https://www.allrecipes.com/recipes/96/salad/'
    r = requests.get(url, headers=headers, timeout=10)
    links = []
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'lxml')
        links = soup.select('a.card__titleLink')

    es = connect_elasticsearch()
    if es is not None:
        if links:
            # Ensure the index once, up front. create_index() is true only
            # when it has just created the index, so gating each store on it
            # (as before) skipped every recipe after the first.
            create_index(es, 'recipes')

            for link in links:
                href = link.get('href')
                if not href:
                    continue

                sleep(2)  # be polite to the remote site between requests
                result = parse(href)
                if result == json.dumps({}):
                    # parse() returns an empty object when the page failed.
                    print('Skipping {}: nothing parsed'.format(href))
                    continue

                if store_record(es, 'recipes', result):
                    print('Data indexed successfully')

        search_object = {'_source': ['title'], 'query': {'range': {'calories': {'gte': 20}}}}
        search(es, 'recipes', json.dumps(search_object))

install_docker.bat

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
2+
@REM Pull the Elasticsearch 7.11.1 image from docker.elastic.co.
docker pull docker.elastic.co/elasticsearch/elasticsearch:7.11.1
3+
4+
@REM docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.11.1
5+
6+
@REM Start a container named "es" from the 7.11.1 image, publishing ports 9200
@REM and 9300 on the host and setting discovery.type=single-node in its environment.
docker run --name es -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.11.1

install_venv.bat

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
@REM Create an isolated environment and install the project's dependencies.
virtualenv venv
@REM "call" is required: running another batch script without it hands over
@REM control for good, so the pip step below would never execute.
call venv\scripts\activate
pip install -r requirements.txt

requirements.txt

420 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)