Skip to content

Commit f823795

Browse files
committed
Initial Commit
1 parent 804a826 commit f823795

File tree

2 files changed

+170
-1
lines changed

2 files changed

+170
-1
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
# Python-Elasticsearch
2-
An example program that scrapes data from AllRecipes.com and store in Elasticsearch
2+
An example program that scrapes data from AllRecipes.com and stores it in Elasticsearch.
3+
4+
This code is part of the blog post: [Getting started with Elasticsearch in Python](http://blog.adnansiddiqi.me/getting-started-with-elasticsearch-in-python/)

fetch_recipes.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
import json
2+
import logging
3+
from pprint import pprint
4+
from time import sleep
5+
6+
import requests
7+
from bs4 import BeautifulSoup
8+
from elasticsearch import Elasticsearch
9+
10+
11+
def search(es_object, index_name, search):
    """Run a query against an index, pretty-print the response and return it.

    Args:
        es_object: Client object exposing ``search(index=..., body=...)``.
        index_name: Name of the index to query.
        search: Query body, handed to the client unchanged as ``body``.
            (Inside this function the parameter shadows the function name.)

    Returns:
        Whatever ``es_object.search`` returned, so callers can use the hits
        instead of only seeing them printed.
    """
    res = es_object.search(index=index_name, body=search)
    pprint(res)
    return res
14+
15+
16+
def create_index(es_object, index_name):
    """Make sure ``index_name`` exists, creating it with the recipe mapping if needed.

    The mapping declares a ``salads`` document type with ``dynamic`` set to
    ``strict`` and the fields ``title``, ``submitter``, ``description``
    (text), ``calories`` (integer) and ``ingredients`` (nested objects with a
    text ``step``).

    Args:
        es_object: Client object exposing ``indices.exists`` and
            ``indices.create``.
        index_name: Name of the index to check or create.

    Returns:
        True when the index is available afterwards (it already existed or the
        create call completed), False when checking or creating it raised.
    """
    # index settings
    settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
        },
        "mappings": {
            "salads": {
                "dynamic": "strict",
                "properties": {
                    "title": {"type": "text"},
                    "submitter": {"type": "text"},
                    "description": {"type": "text"},
                    "calories": {"type": "integer"},
                    "ingredients": {
                        "type": "nested",
                        "properties": {
                            "step": {"type": "text"},
                        },
                    },
                },
            },
        },
    }

    try:
        if es_object.indices.exists(index=index_name):
            # An existing index is just as usable as a new one; reporting
            # False here would make callers skip storing their records.
            return True
        # ignore=400 keeps an "index already exists" response from raising.
        es_object.indices.create(index=index_name, ignore=400, body=settings)
    except Exception as ex:
        # Best effort: report the problem and let the caller decide.
        print(str(ex))
        return False

    print('Created Index')
    return True
61+
62+
63+
def store_record(elastic_object, index_name, record):
    """Index one record as a ``salads`` document and report whether it worked.

    Args:
        elastic_object: Client object exposing
            ``index(index=..., doc_type=..., body=...)``.
        index_name: Name of the target index.
        record: Document body, passed to the client unchanged.

    Returns:
        True when the index call completed, False when it raised an
        ``Exception`` (the error is printed). Interrupts such as Ctrl-C
        propagate instead of being swallowed by a ``finally`` return.
    """
    try:
        outcome = elastic_object.index(index=index_name, doc_type='salads', body=record)
    except Exception as ex:
        print('Error in indexing data')
        print(str(ex))
        return False

    print(outcome)
    return True
74+
75+
76+
def connect_elasticsearch():
    """Create a client for the node at localhost:9200 and verify it responds.

    Returns:
        The ``Elasticsearch`` client when ``ping()`` succeeds, otherwise
        ``None`` so that callers' ``is not None`` checks actually detect an
        unreachable node.
    """
    client = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    if client.ping():
        print('Yay Connected')
        return client

    print('Awww it could not connect!')
    return None
84+
85+
86+
def parse(u, request_headers=None, timeout=30):
    """Fetch one recipe page and return its fields as a JSON string.

    Args:
        u: URL of the recipe page.
        request_headers: Optional HTTP headers for the request. When omitted,
            the module-level ``headers`` dict is used if one is defined.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A JSON object string with the keys ``title``, ``submitter``,
        ``description``, ``calories`` and ``ingredients``. The string is
        ``'{}'`` when the response status is not 200 or when fetching or
        parsing raised (the error is printed).
    """
    if request_headers is None:
        # The script defines ``headers`` only under ``__main__``; look it up
        # defensively so importing this module and calling parse() does not
        # depend on that global being present.
        request_headers = globals().get('headers')

    rec = {}
    try:
        r = requests.get(u, headers=request_headers, timeout=timeout)
        if r.status_code == 200:
            rec = _extract_recipe(BeautifulSoup(r.text, 'lxml'))
    except Exception as ex:
        # Best-effort scraping: one bad page must not stop the whole run.
        print('Exception while parsing')
        print(str(ex))
    return json.dumps(rec)


def _first_text(soup, selector, default):
    """Return the stripped text of the first node matching ``selector``, else ``default``."""
    nodes = soup.select(selector)
    return nodes[0].text.strip() if nodes else default


def _extract_recipe(soup):
    """Build the recipe dict from a parsed recipe page."""
    title = _first_text(soup, '.recipe-summary__h1', '-')
    submit_by = _first_text(soup, '.submitter__name', '-')
    description = _first_text(soup, '.submitter__description', '-').replace('"', '')

    calories = 0
    calorie_nodes = soup.select('.calorie-count')
    if calorie_nodes:
        calories = calorie_nodes[0].text.replace('cals', '').strip()
        try:
            # The index maps ``calories`` as an integer, so send a number.
            calories = int(calories)
        except ValueError:
            # Unexpected format: keep the raw text rather than lose it.
            pass

    ingredients = []
    for node in soup.select('.recipe-ingred_txt'):
        text = node.text.strip()
        # Skip blanks and the "Add all ingredients to list" control label.
        if text and 'Add all ingredients to list' not in text:
            ingredients.append({'step': text})

    return {
        'title': title,
        'submitter': submit_by,
        'description': description,
        'calories': calories,
        'ingredients': ingredients,
    }
136+
137+
138+
if __name__ == '__main__':
    # Scrape the salad listing, index every recipe found, then run a sample
    # range query against the index.

    # Kept at module level: parse() reads ``headers`` as a global.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        'Pragma': 'no-cache'
    }
    logging.basicConfig(level=logging.ERROR)

    index_name = 'recipes'
    url = 'https://www.allrecipes.com/recipes/96/salad/'

    # One connection serves both the indexing and the search below.
    es = connect_elasticsearch()

    links = []
    r = requests.get(url, headers=headers, timeout=30)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'lxml')
        links = soup.select('.fixed-recipe-card__h3 a')

    index_ready = False
    if links and es is not None:
        # Prepare the index once, before the loop, instead of once per recipe.
        # create_index() may report False for an index that already exists,
        # so double-check before giving up on storing anything.
        index_ready = create_index(es, index_name) or es.indices.exists(index=index_name)

    if index_ready:
        for link in links:
            sleep(2)  # pause between page fetches (presumably to go easy on the site)
            result = parse(link['href'])
            if not json.loads(result):
                # parse() yields '{}' when the page could not be fetched or
                # parsed; do not index an empty document.
                continue
            if store_record(es, index_name, result):
                print('Data indexed successfully')

    if es is not None:
        # Alternative queries:
        # search_object = {'query': {'match': {'calories': '102'}}}
        # search_object = {'_source': ['title'], 'query': {'match': {'calories': '102'}}}
        search_object = {'_source': ['title'], 'query': {'range': {'calories': {'gte': 20}}}}
        search(es, index_name, json.dumps(search_object))

0 commit comments

Comments
 (0)