Skip to content

Commit 695e41c

Browse files
authored
Add open source client solutions
1 parent 0fc5158 commit 695e41c

File tree

2 files changed

+677
-0
lines changed

2 files changed

+677
-0
lines changed

NLP_QAparser.ipynb

Lines changed: 355 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,355 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {
7+
"scrolled": true
8+
},
9+
"outputs": [
10+
{
11+
"name": "stderr",
12+
"output_type": "stream",
13+
"text": [
14+
"[nltk_data] Downloading package maxent_treebank_pos_tagger to\n",
15+
"[nltk_data] /home/jovyan/nltk_data...\n",
16+
"[nltk_data] Package maxent_treebank_pos_tagger is already up-to-\n",
17+
"[nltk_data] date!\n",
18+
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
19+
"[nltk_data] /home/jovyan/nltk_data...\n",
20+
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
21+
"[nltk_data] date!\n"
22+
]
23+
}
24+
],
25+
"source": [
26+
"###################################################\n",
27+
"# Tool to Get Q-A Pairs from Support Page #\n",
28+
"###################################################\n",
29+
"# #\n",
30+
"# #\n",
31+
"# AbderRahman N. Sobh - 12/27/2018 #\n",
32+
"# All Rights Reserved. #\n",
33+
"###################################################\n",
34+
"\n",
35+
"# Required packages; install any that are missing from your environment.\n",
36+
"\n",
37+
"#!pip install gensim\n",
38+
"#!pip install nltk\n",
39+
"#!pip install pandas\n",
40+
"#!pip install matplotlib\n",
41+
"#!pip install beautifulsoup4\n",
42+
"\n",
43+
"# Package Imports\n",
44+
"import pandas as pd\n",
45+
"import numpy as np\n",
46+
"import matplotlib.pyplot as plt\n",
47+
"\n",
48+
"import gensim\n",
49+
"from gensim.utils import simple_preprocess\n",
50+
"from gensim.parsing.preprocessing import STOPWORDS\n",
51+
"from gensim.summarization.textcleaner import get_sentences\n",
52+
"\n",
53+
"from gensim.models import TfidfModel\n",
54+
"from gensim.models.coherencemodel import CoherenceModel\n",
55+
"\n",
56+
"import nltk\n",
57+
"nltk.download('maxent_treebank_pos_tagger')\n",
58+
"nltk.download('averaged_perceptron_tagger')\n",
59+
"\n",
60+
"import urllib.request\n",
61+
"from bs4 import BeautifulSoup"
62+
]
63+
},
64+
{
65+
"cell_type": "code",
66+
"execution_count": 2,
67+
"metadata": {},
68+
"outputs": [],
69+
"source": [
70+
"#################################################\n",
71+
"# Parse all the Q-A URLs from the Support Pages #\n",
72+
"#################################################\n",
73+
"def get_qapages():\n",
74+
" \n",
75+
" # Parse the support page for sub-links\n",
76+
" support_page = urllib.request.urlopen('URL HERE')\n",
77+
" soup = BeautifulSoup(support_page, 'html.parser')\n",
78+
" sublinks = []\n",
79+
" for i in soup.find_all('a', class_='mt-listing-detailed-subpage-title internal'):\n",
80+
" sublinks+=[i['href']]\n",
81+
"\n",
82+
" # Parse sublinks for sub-sublinks\n",
83+
" subsublinks = []\n",
84+
" for pg in sublinks:\n",
85+
" page = urllib.request.urlopen(pg)\n",
86+
" soup = BeautifulSoup(page, 'html.parser')\n",
87+
" for i in soup.find_all('a', class_='mt-listing-detailed-subpage-title internal'):\n",
88+
" subsublinks+=[i['href']]\n",
89+
"\n",
90+
" # Parse sub-sublinks for actual Question-Answer pages\n",
91+
" qapages = []\n",
92+
" for link in subsublinks:\n",
93+
" page = urllib.request.urlopen(link)\n",
94+
" soup = BeautifulSoup(page, 'html.parser')\n",
95+
" for i in soup.find_all('a', class_='mt-listing-detailed-subpage-title internal'):\n",
96+
" qapages+=[i['href']]\n",
97+
" \n",
98+
" return qapages"
99+
]
100+
},
101+
{
102+
"cell_type": "code",
103+
"execution_count": 3,
104+
"metadata": {},
105+
"outputs": [],
106+
"source": [
107+
"#######################################################################\n",
108+
"# Extract Question (Title) and Answer (Full Text) Pairs from each URL #\n",
109+
"#######################################################################\n",
110+
"def get_qapairs(qapages):\n",
111+
" \n",
112+
" qapairs = []\n",
113+
"\n",
114+
" for qapg in qapages:\n",
115+
" page = urllib.request.urlopen(qapg)\n",
116+
" soup = BeautifulSoup(page, 'html.parser')\n",
117+
"\n",
118+
" title = soup.find_all('h1', {'id' : 'title'})[0].contents[0].strip()\n",
119+
" fulltext = ''\n",
120+
" for block in soup.find_all('div', {'class' : 'mt-section'}):\n",
121+
" fulltext += ' {}'.format(block.text)\n",
122+
"\n",
123+
" qapairs += [(title, fulltext.strip(), qapg)]\n",
124+
" \n",
125+
" return qapairs"
126+
]
127+
},
128+
{
129+
"cell_type": "code",
130+
"execution_count": 4,
131+
"metadata": {},
132+
"outputs": [],
133+
"source": [
134+
"############################################################################################\n",
135+
"# Extract NLP data structures from any specified text column (i.e. 'Question' or 'Answer') #\n",
136+
"############################################################################################\n",
137+
"def tokenize(text):\n",
138+
" return [token for token in simple_preprocess(text) if token not in STOPWORDS]\n",
139+
"\n",
140+
"def get_NLP(df, column):\n",
141+
"\n",
142+
" # Extract all the data structures we need from the initial data: sentences, tokens, dictionaries, and corpuses\n",
143+
" df['tokens'] = df[column].apply(lambda x: tokenize(x))\n",
144+
" common_dictionary = gensim.corpora.Dictionary(df['tokens'])\n",
145+
" common_corpus = [common_dictionary.doc2bow(text) for text in df['tokens']]\n",
146+
" df['corpus'] = common_corpus\n",
147+
" df['pos_tags'] = df['tokens'].apply(lambda x: nltk.pos_tag(x))\n",
148+
" \n",
149+
" # Get a TFIDF Model and apply the scores to the words in each entry of the text column\n",
150+
" tmodel = TfidfModel(corpus=common_corpus)\n",
151+
" df['tfidf_wordscores'] = df['corpus'].apply(lambda x: [(common_dictionary.get(a),b) for (a,b) in tmodel[x]])\n",
152+
" \n",
153+
" icols = ['tokens','pos_tags', 'corpus','tfidf_wordscores']\n",
154+
" jcols = [item+'_'+column for item in icols]\n",
155+
" df = df.rename(columns=dict(zip(icols, jcols)))\n",
156+
" \n",
157+
" return df, common_dictionary, common_corpus\n",
158+
" "
159+
]
160+
},
161+
{
162+
"cell_type": "code",
163+
"execution_count": 5,
164+
"metadata": {},
165+
"outputs": [],
166+
"source": [
167+
"##################################################################\n",
168+
"# Investigate Coherence Scores by iteratively fitting LDA models #\n",
169+
"##################################################################\n",
170+
"\n",
171+
"# Coherence models are used to estimate roughly how many topics are present in the dataset.\n",
172+
"# This is an exploratory portion of the code which requires human judgment to select the best option.\n",
173+
"# It would also be possible to select the topic count automatically by maximizing a combined score of both metrics.\n",
174+
"\n",
175+
"# Note that this portion of the code fits one model per candidate topic count\n",
176+
"# and stores ALL of them in a list, allowing for easy comparison of the different models on the fly.\n",
177+
"# This is resource intensive!\n",
178+
"\n",
179+
"def compute_coherence_values(dictionary, corpus,texts, start, limit, step, ctype):\n",
180+
" coherence_values = []\n",
181+
" model_list = []\n",
182+
" \n",
183+
" for num_topics in range(start, limit, step):\n",
184+
" model= gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,num_topics=num_topics, random_state=7)\n",
185+
" model_list.append(model)\n",
186+
" cm = CoherenceModel(model=model, dictionary=dictionary, corpus=corpus, texts=texts, coherence=ctype)\n",
187+
" coherence_values.append(cm.get_coherence())\n",
188+
" \n",
189+
" return model_list, coherence_values\n",
190+
"\n",
191+
"\n",
192+
"# Left unimplemented for the sake of time; the topic count is chosen by human observation for now. See the plotting function below.\n",
193+
"#def maximize_coherence(cvals1, cvals2):\n",
194+
"\n",
195+
"# Plots the standard-scaled coherence values for each metric, and their sum.\n",
196+
"# The optimal topic count should perform well across both performance metrics.\n",
197+
"def plot_coherence(dictionary, corpus, tokens):\n",
198+
" # Compute u_mass and c_v coherence scores to compare results\n",
199+
" step = 1\n",
200+
" model_list1, coherence_values1 = compute_coherence_values(\n",
201+
" dictionary=dictionary, corpus=corpus, texts=None, start=2, limit=40, step=1, ctype='u_mass')\n",
202+
"\n",
203+
" model_list2, coherence_values2 = compute_coherence_values(\n",
204+
" dictionary=dictionary, corpus=corpus, texts=tokens, start=2, limit=40, step=1, ctype='c_v')\n",
205+
"\n",
206+
" # Put both metrics on the same scale for observation\n",
207+
" from sklearn.preprocessing import StandardScaler\n",
208+
"\n",
209+
" scaler = StandardScaler()\n",
210+
" scaler.fit(np.array(coherence_values1).reshape(-1,1))\n",
211+
" cvn1 = scaler.transform(np.array(coherence_values1).reshape(-1,1))\n",
212+
" scaler.fit(np.array(coherence_values2).reshape(-1,1))\n",
213+
" cvn2 = scaler.transform(np.array(coherence_values2).reshape(-1,1))\n",
214+
"\n",
215+
"\n",
216+
" # Plot coherence scores\n",
217+
" limit=40; start=2; step=1;\n",
218+
" x = range(start, limit, step)\n",
219+
" plt.figure(figsize=(10,5))\n",
220+
" plt.grid()\n",
221+
" plt.plot(x, cvn1)\n",
222+
" plt.xticks(np.arange(start, limit, step=step))\n",
223+
" plt.xlabel(\"Num Topics\")\n",
224+
" plt.ylabel(\"Coherence score (u_mass)\")\n",
225+
" plt.legend((\"coherence_values\"), loc='best')\n",
226+
" plt.show()\n",
227+
"\n",
228+
" plt.figure(figsize=(10,5))\n",
229+
" plt.grid()\n",
230+
" plt.plot(x, cvn2)\n",
231+
" plt.xticks(np.arange(start, limit, step=step))\n",
232+
" plt.xlabel(\"Num Topics\")\n",
233+
" plt.ylabel(\"Coherence score (c_v)\")\n",
234+
" plt.legend((\"coherence_values\"), loc='best')\n",
235+
" plt.show()\n",
236+
" \n",
237+
" plt.figure(figsize=(10,5))\n",
238+
" plt.grid()\n",
239+
" plt.plot(x, cvn1+cvn2)\n",
240+
" plt.xticks(np.arange(start, limit, step=step))\n",
241+
" plt.xlabel(\"Num Topics\")\n",
242+
" plt.ylabel(\"Sum of Metrics\")\n",
243+
" plt.legend((\"coherence_values\"), loc='best')\n",
244+
" plt.show()\n",
245+
" "
246+
]
247+
},
248+
{
249+
"cell_type": "code",
250+
"execution_count": 6,
251+
"metadata": {},
252+
"outputs": [],
253+
"source": [
254+
"##############################################\n",
255+
"# The main function which controls this tool #\n",
256+
"##############################################\n",
257+
"\n",
258+
"def main():\n",
259+
" \n",
260+
" # Generate the DataFrame\n",
261+
" qapages = get_qapages()\n",
262+
" qapairs = get_qapairs(qapages)\n",
263+
" df = pd.DataFrame(qapairs)\n",
264+
" df.columns = ['Question', 'Answer', 'URL']\n",
265+
"\n",
266+
" # Non-specific schema components\n",
267+
" df[''] = ''\n",
268+
" df['pre-processing'] = \"Convert to lowercase, remove tokens < 2 chars or > 15 chars, remove stopwords (Stone, Denis, Kwantes (2010)), split on whitespace\"\n",
269+
"\n",
270+
" # Generate the full schema and fill with values\n",
271+
" cdict = {}\n",
272+
" ccorpus = {}\n",
273+
" LDAmodel = {}\n",
274+
" model_topics = {}\n",
275+
" col_list = ['Question', 'Answer', '','pre-processing']\n",
276+
" ce = ['bow', 'pos_tags', 'keywords', 'context', 'LDA', 'topic/intent', 'topic_key']\n",
277+
"\n",
278+
" text_to_use = ['Question', 'Answer'] \n",
279+
"\n",
280+
" for entry in text_to_use:\n",
281+
" # Enrich with NLP features, build Corpora and Dictionaries\n",
282+
" df, cdict[entry], ccorpus[entry] = get_NLP(df, entry)\n",
283+
"\n",
284+
" # If in notebook, consider generating the plots for observation:\n",
285+
" #%matplotlib inline\n",
286+
" #print('{} LDA Coherence Plots'.format(entry))\n",
287+
" #plot_coherence(cdict[entry],ccorpus[entry], df['tokens_{}'.format(entry)])\n",
288+
"\n",
289+
" # Fit and select the best LDA Models\n",
290+
" # Num of topics was determined by observation, though should be replaced with a good optimization\n",
291+
" LDAmodel[entry] = gensim.models.ldamodel.LdaModel(\n",
292+
" corpus=ccorpus[entry], id2word=cdict[entry], num_topics=9, random_state=7)\n",
293+
"\n",
294+
" # Apply LDA to all documents\n",
295+
" df['LDA_TopicPresence_{}'.format(entry)] = df['corpus_{}'.format(entry)].apply(lambda x: LDAmodel[entry][x])\n",
296+
" # Apply a threshold to topic presence for final interpretations, here I use 10%\n",
297+
" df['topic/intent_{}'.format(entry)] = df['LDA_TopicPresence_{}'.format(entry)].apply(lambda x: [i for i,v in x if v > 0.1])\n",
298+
" model_topics[entry] = [LDAmodel[entry].show_topic(n) for n in range(0,9)]\n",
299+
" df['topic_key_{}'.format(entry)] = ''\n",
300+
" df['topic_key_{}'.format(entry)][0] = model_topics[entry]\n",
301+
"\n",
302+
" \n",
303+
" # Apply expected schema components\n",
304+
" df['keywords_{}'.format(entry)] = df['tfidf_wordscores_{}'.format(entry)]\n",
305+
"\n",
306+
" df['context_{}'.format(entry)] = df['pos_tags_{}'.format(entry)].apply(lambda x: [k for k,v in x if v == 'NN'])\n",
307+
" df = df.rename(columns={'tokens_{}'.format(entry):'bow_{}'.format(entry),\n",
308+
" 'LDA_TopicPresence_{}'.format(entry):'LDA_{}'.format(entry)})\n",
309+
"\n",
310+
" col_list = col_list + [i+'_{}'.format(entry) for i in ce]\n",
311+
"\n",
312+
" fdf = df[col_list] \n",
313+
" fdf.to_csv('out.csv')\n",
314+
" return\n"
315+
]
316+
},
317+
{
318+
"cell_type": "code",
319+
"execution_count": 7,
320+
"metadata": {},
321+
"outputs": [],
322+
"source": [
323+
"main()"
324+
]
325+
},
326+
{
327+
"cell_type": "code",
328+
"execution_count": null,
329+
"metadata": {},
330+
"outputs": [],
331+
"source": []
332+
}
333+
],
334+
"metadata": {
335+
"kernelspec": {
336+
"display_name": "Python 3",
337+
"language": "python",
338+
"name": "python3"
339+
},
340+
"language_info": {
341+
"codemirror_mode": {
342+
"name": "ipython",
343+
"version": 3
344+
},
345+
"file_extension": ".py",
346+
"mimetype": "text/x-python",
347+
"name": "python",
348+
"nbconvert_exporter": "python",
349+
"pygments_lexer": "ipython3",
350+
"version": "3.6.6"
351+
}
352+
},
353+
"nbformat": 4,
354+
"nbformat_minor": 2
355+
}

0 commit comments

Comments
 (0)