|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 1, |
| 6 | + "metadata": { |
| 7 | + "scrolled": true |
| 8 | + }, |
| 9 | + "outputs": [ |
| 10 | + { |
| 11 | + "name": "stderr", |
| 12 | + "output_type": "stream", |
| 13 | + "text": [ |
| 14 | + "[nltk_data] Downloading package maxent_treebank_pos_tagger to\n", |
| 15 | + "[nltk_data] /home/jovyan/nltk_data...\n", |
| 16 | + "[nltk_data] Package maxent_treebank_pos_tagger is already up-to-\n", |
| 17 | + "[nltk_data] date!\n", |
| 18 | + "[nltk_data] Downloading package averaged_perceptron_tagger to\n", |
| 19 | + "[nltk_data] /home/jovyan/nltk_data...\n", |
| 20 | + "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", |
| 21 | + "[nltk_data] date!\n" |
| 22 | + ] |
| 23 | + } |
| 24 | + ], |
| 25 | + "source": [ |
| 26 | + "###################################################\n", |
| 27 | + "# Tool to Get Q-A Pairs from Support Page #\n", |
| 28 | + "###################################################\n", |
| 29 | + "# #\n", |
| 30 | + "# #\n", |
| 31 | + "# AbderRahman N. Sobh - 12/27/2018 #\n", |
| 32 | + "# All Rights Reserved. #\n", |
| 33 | + "###################################################\n", |
| 34 | + "\n", |
| 35 | + "# Required Packages, install if anything is missing in your environment.\n", |
| 36 | + "\n", |
| 37 | + "#!pip install gensim\n", |
| 38 | + "#!pip install nltk\n", |
| 39 | + "#!pip install pandas\n", |
| 40 | + "#!pip install matplotlib\n", |
| 41 | + "#!pip install beautifulsoup4\n", |
| 42 | + "\n", |
| 43 | + "# Package Imports\n", |
| 44 | + "import pandas as pd\n", |
| 45 | + "import numpy as np\n", |
| 46 | + "import matplotlib.pyplot as plt\n", |
| 47 | + "\n", |
| 48 | + "import gensim\n", |
| 49 | + "from gensim.utils import simple_preprocess\n", |
| 50 | + "from gensim.parsing.preprocessing import STOPWORDS\n", |
| 51 | + "from gensim.summarization.textcleaner import get_sentences\n", |
| 52 | + "\n", |
| 53 | + "from gensim.models import TfidfModel\n", |
| 54 | + "from gensim.models.coherencemodel import CoherenceModel\n", |
| 55 | + "\n", |
| 56 | + "import nltk\n", |
| 57 | + "nltk.download('maxent_treebank_pos_tagger')\n", |
| 58 | + "nltk.download('averaged_perceptron_tagger')\n", |
| 59 | + "\n", |
| 60 | + "import urllib.request\n", |
| 61 | + "from bs4 import BeautifulSoup" |
| 62 | + ] |
| 63 | + }, |
| 64 | + { |
| 65 | + "cell_type": "code", |
| 66 | + "execution_count": 2, |
| 67 | + "metadata": {}, |
| 68 | + "outputs": [], |
| 69 | + "source": [ |
| 70 | + "#################################################\n", |
| 71 | + "# Parse all the Q-A URLs from the Support Pages #\n", |
| 72 | + "#################################################\n", |
| 73 | + "def get_qapages():\n", |
| 74 | + " \n", |
| 75 | + " # Parse the support page for sub-links\n", |
| 76 | + " support_page = urllib.request.urlopen('URL HERE')\n", |
| 77 | + " soup = BeautifulSoup(support_page, 'html.parser')\n", |
| 78 | + " sublinks = []\n", |
| 79 | + " for i in soup.find_all('a', class_='mt-listing-detailed-subpage-title internal'):\n", |
| 80 | + " sublinks+=[i['href']]\n", |
| 81 | + "\n", |
| 82 | + " # Parse sublinks for sub-sublinks\n", |
| 83 | + " subsublinks = []\n", |
| 84 | + " for pg in sublinks:\n", |
| 85 | + " page = urllib.request.urlopen(pg)\n", |
| 86 | + " soup = BeautifulSoup(page, 'html.parser')\n", |
| 87 | + " for i in soup.find_all('a', class_='mt-listing-detailed-subpage-title internal'):\n", |
| 88 | + " subsublinks+=[i['href']]\n", |
| 89 | + "\n", |
| 90 | + " # Parse sub-sublinks for actual Question-Answer pages\n", |
| 91 | + " qapages = []\n", |
| 92 | + " for link in subsublinks:\n", |
| 93 | + " page = urllib.request.urlopen(link)\n", |
| 94 | + " soup = BeautifulSoup(page, 'html.parser')\n", |
| 95 | + " for i in soup.find_all('a', class_='mt-listing-detailed-subpage-title internal'):\n", |
| 96 | + " qapages+=[i['href']]\n", |
| 97 | + " \n", |
| 98 | + " return qapages" |
| 99 | + ] |
| 100 | + }, |
| 101 | + { |
| 102 | + "cell_type": "code", |
| 103 | + "execution_count": 3, |
| 104 | + "metadata": {}, |
| 105 | + "outputs": [], |
| 106 | + "source": [ |
| 107 | + "#######################################################################\n", |
| 108 | + "# Extract Question (Title) and Answer (Full Text) Pairs from each URL #\n", |
| 109 | + "#######################################################################\n", |
| 110 | + "def get_qapairs(qapages):\n", |
| 111 | + " \n", |
| 112 | + " qapairs = []\n", |
| 113 | + "\n", |
| 114 | + " for qapg in qapages:\n", |
| 115 | + " page = urllib.request.urlopen(qapg)\n", |
| 116 | + " soup = BeautifulSoup(page, 'html.parser')\n", |
| 117 | + "\n", |
| 118 | + " title = soup.find_all('h1', {'id' : 'title'})[0].contents[0].strip()\n", |
| 119 | + " fulltext = ''\n", |
| 120 | + " for block in soup.find_all('div', {'class' : 'mt-section'}):\n", |
| 121 | + " fulltext += ' {}'.format(block.text)\n", |
| 122 | + "\n", |
| 123 | + " qapairs += [(title, fulltext.strip(), qapg)]\n", |
| 124 | + " \n", |
| 125 | + " return qapairs" |
| 126 | + ] |
| 127 | + }, |
| 128 | + { |
| 129 | + "cell_type": "code", |
| 130 | + "execution_count": 4, |
| 131 | + "metadata": {}, |
| 132 | + "outputs": [], |
| 133 | + "source": [ |
| 134 | + "############################################################################################\n", |
| 135 | + "# Extract NLP data structures from any specified text column (i.e. 'Question' or 'Answer') #\n", |
| 136 | + "############################################################################################\n", |
| 137 | + "def tokenize(text):\n", |
| 138 | + " return [token for token in simple_preprocess(text) if token not in STOPWORDS]\n", |
| 139 | + "\n", |
| 140 | + "def get_NLP(df, column):\n", |
| 141 | + "\n", |
| 142 | + " # Extract all the data structures we need from the initial data: sentences, tokens, dictionaries, and corpuses\n", |
| 143 | + " df['tokens'] = df[column].apply(lambda x: tokenize(x))\n", |
| 144 | + " common_dictionary = gensim.corpora.Dictionary(df['tokens'])\n", |
| 145 | + " common_corpus = [common_dictionary.doc2bow(text) for text in df['tokens']]\n", |
| 146 | + " df['corpus'] = common_corpus\n", |
| 147 | + " df['pos_tags'] = df['tokens'].apply(lambda x: nltk.pos_tag(x))\n", |
| 148 | + " \n", |
| 149 | + " # Get a TFIDF Model and apply the scores to the words in each entry of the text column\n", |
| 150 | + " tmodel = TfidfModel(corpus=common_corpus)\n", |
| 151 | + " df['tfidf_wordscores'] = df['corpus'].apply(lambda x: [(common_dictionary.get(a),b) for (a,b) in tmodel[x]])\n", |
| 152 | + " \n", |
| 153 | + " icols = ['tokens','pos_tags', 'corpus','tfidf_wordscores']\n", |
| 154 | + " jcols = [item+'_'+column for item in icols]\n", |
| 155 | + " df = df.rename(columns=dict(zip(icols, jcols)))\n", |
| 156 | + " \n", |
| 157 | + " return df, common_dictionary, common_corpus\n", |
| 158 | + " " |
| 159 | + ] |
| 160 | + }, |
| 161 | + { |
| 162 | + "cell_type": "code", |
| 163 | + "execution_count": 5, |
| 164 | + "metadata": {}, |
| 165 | + "outputs": [], |
| 166 | + "source": [ |
| 167 | + "##################################################################\n", |
| 168 | + "# Investigate Coherence Scores by iteratively fitting LDA models #\n", |
| 169 | + "##################################################################\n", |
| 170 | + "\n", |
| 171 | + "# Coherence models are used to determine roughly how many topics we expect are present in the dataset\n", |
| 172 | + "# This is an exploratory portion of the code which requires human inference to select the best option.\n", |
| 173 | + "# Though, it is possible to score a maximization between both metrics as well.\n", |
| 174 | + "\n", |
| 175 | + "# Note that this portion of the code essentially creates a number of models with different topic selection counts\n", |
| 176 | + "# and stores ALL of them as an array, allowing for easy testing of different model types on the fly.\n", |
| 177 | + "# This is resource intensive!\n", |
| 178 | + "\n", |
| 179 | + "def compute_coherence_values(dictionary, corpus,texts, start, limit, step, ctype):\n", |
| 180 | + " coherence_values = []\n", |
| 181 | + " model_list = []\n", |
| 182 | + " \n", |
| 183 | + " for num_topics in range(start, limit, step):\n", |
| 184 | + " model= gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,num_topics=num_topics, random_state=7)\n", |
| 185 | + " model_list.append(model)\n", |
| 186 | + " cm = CoherenceModel(model=model, dictionary=dictionary, corpus=corpus, texts=texts, coherence=ctype)\n", |
| 187 | + " coherence_values.append(cm.get_coherence())\n", |
| 188 | + " \n", |
| 189 | + " return model_list, coherence_values\n", |
| 190 | + "\n", |
| 191 | + "\n", |
| 192 | + "# Left off for the sake of time ... going off of human observation this time. See plotting function below.\n", |
| 193 | + "#def maximize_coherence(cvals1, cvals2):\n", |
| 194 | + "\n", |
| 195 | + "# Plots of Standard scaled coherence values, their sum. \n", |
| 196 | + "# Optimal topic selection should perform well across both performance metrics.\n", |
| 197 | + "def plot_coherence(dictionary, corpus, tokens):\n", |
| 198 | + " # Compute u_mass and c_v coherence scores to compare results\n", |
| 199 | + " step = 1\n", |
| 200 | + " model_list1, coherence_values1 = compute_coherence_values(\n", |
| 201 | + " dictionary=dictionary, corpus=corpus, texts=None, start=2, limit=40, step=1, ctype='u_mass')\n", |
| 202 | + "\n", |
| 203 | + " model_list2, coherence_values2 = compute_coherence_values(\n", |
| 204 | + " dictionary=dictionary, corpus=corpus, texts=tokens, start=2, limit=40, step=1, ctype='c_v')\n", |
| 205 | + "\n", |
| 206 | + " # Put both metrics on the same scale for observation\n", |
| 207 | + " from sklearn.preprocessing import StandardScaler\n", |
| 208 | + "\n", |
| 209 | + " scaler = StandardScaler()\n", |
| 210 | + " scaler.fit(np.array(coherence_values1).reshape(-1,1))\n", |
| 211 | + " cvn1 = scaler.transform(np.array(coherence_values1).reshape(-1,1))\n", |
| 212 | + " scaler.fit(np.array(coherence_values2).reshape(-1,1))\n", |
| 213 | + " cvn2 = scaler.transform(np.array(coherence_values2).reshape(-1,1))\n", |
| 214 | + "\n", |
| 215 | + "\n", |
| 216 | + " # Plot coherence scores\n", |
| 217 | + " limit=40; start=2; step=1;\n", |
| 218 | + " x = range(start, limit, step)\n", |
| 219 | + " plt.figure(figsize=(10,5))\n", |
| 220 | + " plt.grid()\n", |
| 221 | + " plt.plot(x, cvn1)\n", |
| 222 | + " plt.xticks(np.arange(start, limit, step=step))\n", |
| 223 | + " plt.xlabel(\"Num Topics\")\n", |
| 224 | + " plt.ylabel(\"Coherence score (u_mass)\")\n", |
| 225 | + " plt.legend((\"coherence_values\"), loc='best')\n", |
| 226 | + " plt.show()\n", |
| 227 | + "\n", |
| 228 | + " plt.figure(figsize=(10,5))\n", |
| 229 | + " plt.grid()\n", |
| 230 | + " plt.plot(x, cvn2)\n", |
| 231 | + " plt.xticks(np.arange(start, limit, step=step))\n", |
| 232 | + " plt.xlabel(\"Num Topics\")\n", |
| 233 | + " plt.ylabel(\"Coherence score (c_v)\")\n", |
| 234 | + " plt.legend((\"coherence_values\"), loc='best')\n", |
| 235 | + " plt.show()\n", |
| 236 | + " \n", |
| 237 | + " plt.figure(figsize=(10,5))\n", |
| 238 | + " plt.grid()\n", |
| 239 | + " plt.plot(x, cvn1+cvn2)\n", |
| 240 | + " plt.xticks(np.arange(start, limit, step=step))\n", |
| 241 | + " plt.xlabel(\"Num Topics\")\n", |
| 242 | + " plt.ylabel(\"Sum of Metrics\")\n", |
| 243 | + " plt.legend((\"coherence_values\"), loc='best')\n", |
| 244 | + " plt.show()\n", |
| 245 | + " " |
| 246 | + ] |
| 247 | + }, |
| 248 | + { |
| 249 | + "cell_type": "code", |
| 250 | + "execution_count": 6, |
| 251 | + "metadata": {}, |
| 252 | + "outputs": [], |
| 253 | + "source": [ |
| 254 | + "##############################################\n", |
| 255 | + "# The main function which controls this tool #\n", |
| 256 | + "##############################################\n", |
| 257 | + "\n", |
| 258 | + "def main():\n", |
| 259 | + " \n", |
| 260 | + " # Generate the DataFrame\n", |
| 261 | + " qapages = get_qapages()\n", |
| 262 | + " qapairs = get_qapairs(qapages)\n", |
| 263 | + " df = pd.DataFrame(qapairs)\n", |
| 264 | + " df.columns = ['Question', 'Answer', 'URL']\n", |
| 265 | + "\n", |
| 266 | + " # Non-specific schema components\n", |
| 267 | + " df[''] = ''\n", |
| 268 | + " df['pre-processing'] = \"Convert to lowercase, remove tokens < 2 chars or > 15 chars, remove stopwords (Stone, Denis, Kwantes (2010)), split on whitespace\"\n", |
| 269 | + "\n", |
| 270 | + " # Generate the full schema and fill with values\n", |
| 271 | + " cdict = {}\n", |
| 272 | + " ccorpus = {}\n", |
| 273 | + " LDAmodel = {}\n", |
| 274 | + " model_topics = {}\n", |
| 275 | + " col_list = ['Question', 'Answer', '','pre-processing']\n", |
| 276 | + " ce = ['bow', 'pos_tags', 'keywords', 'context', 'LDA', 'topic/intent', 'topic_key']\n", |
| 277 | + "\n", |
| 278 | + " text_to_use = ['Question', 'Answer'] \n", |
| 279 | + "\n", |
| 280 | + " for entry in text_to_use:\n", |
| 281 | + " # Enrich with NLP features, build Corpora and Dictionaries\n", |
| 282 | + " df, cdict[entry], ccorpus[entry] = get_NLP(df, entry)\n", |
| 283 | + "\n", |
| 284 | + " # If in notebook, consider generating the plots for observation:\n", |
| 285 | + " #%matplotlib inline\n", |
| 286 | + " #print('{} LDA Coherence Plots'.format(entry))\n", |
| 287 | + " #plot_coherence(cdict[entry],ccorpus[entry], df['tokens_{}'.format(entry)])\n", |
| 288 | + "\n", |
| 289 | + " # Fit and select the best LDA Models\n", |
| 290 | + " # Num of topics was determined by observation, though should be replaced with a good optimization\n", |
| 291 | + " LDAmodel[entry] = gensim.models.ldamodel.LdaModel(\n", |
| 292 | + " corpus=ccorpus[entry], id2word=cdict[entry], num_topics=9, random_state=7)\n", |
| 293 | + "\n", |
| 294 | + " # Apply LDA to all documents\n", |
| 295 | + " df['LDA_TopicPresence_{}'.format(entry)] = df['corpus_{}'.format(entry)].apply(lambda x: LDAmodel[entry][x])\n", |
| 296 | + " # Apply a threshold to topic presence for final interpretations, here I use 10%\n", |
| 297 | + " df['topic/intent_{}'.format(entry)] = df['LDA_TopicPresence_{}'.format(entry)].apply(lambda x: [i for i,v in x if v > 0.1])\n", |
| 298 | + " model_topics[entry] = [LDAmodel[entry].show_topic(n) for n in range(0,9)]\n", |
| 299 | + " df['topic_key_{}'.format(entry)] = ''\n", |
| 300 | + " df['topic_key_{}'.format(entry)][0] = model_topics[entry]\n", |
| 301 | + "\n", |
| 302 | + " \n", |
| 303 | + " # Apply expected schema components\n", |
| 304 | + " df['keywords_{}'.format(entry)] = df['tfidf_wordscores_{}'.format(entry)]\n", |
| 305 | + "\n", |
| 306 | + " df['context_{}'.format(entry)] = df['pos_tags_{}'.format(entry)].apply(lambda x: [k for k,v in x if v == 'NN'])\n", |
| 307 | + " df = df.rename(columns={'tokens_{}'.format(entry):'bow_{}'.format(entry),\n", |
| 308 | + " 'LDA_TopicPresence_{}'.format(entry):'LDA_{}'.format(entry)})\n", |
| 309 | + "\n", |
| 310 | + " col_list = col_list + [i+'_{}'.format(entry) for i in ce]\n", |
| 311 | + "\n", |
| 312 | + " fdf = df[col_list] \n", |
| 313 | + " fdf.to_csv('out.csv')\n", |
| 314 | + " return\n" |
| 315 | + ] |
| 316 | + }, |
| 317 | + { |
| 318 | + "cell_type": "code", |
| 319 | + "execution_count": 7, |
| 320 | + "metadata": {}, |
| 321 | + "outputs": [], |
| 322 | + "source": [ |
| 323 | + "main()" |
| 324 | + ] |
| 325 | + }, |
| 326 | + { |
| 327 | + "cell_type": "code", |
| 328 | + "execution_count": null, |
| 329 | + "metadata": {}, |
| 330 | + "outputs": [], |
| 331 | + "source": [] |
| 332 | + } |
| 333 | + ], |
| 334 | + "metadata": { |
| 335 | + "kernelspec": { |
| 336 | + "display_name": "Python 3", |
| 337 | + "language": "python", |
| 338 | + "name": "python3" |
| 339 | + }, |
| 340 | + "language_info": { |
| 341 | + "codemirror_mode": { |
| 342 | + "name": "ipython", |
| 343 | + "version": 3 |
| 344 | + }, |
| 345 | + "file_extension": ".py", |
| 346 | + "mimetype": "text/x-python", |
| 347 | + "name": "python", |
| 348 | + "nbconvert_exporter": "python", |
| 349 | + "pygments_lexer": "ipython3", |
| 350 | + "version": "3.6.6" |
| 351 | + } |
| 352 | + }, |
| 353 | + "nbformat": 4, |
| 354 | + "nbformat_minor": 2 |
| 355 | +} |
0 commit comments