itg-abby
diff --git a/‎NLP_QAparser.ipynb‎
Lines changed: 355 additions & 0 deletions b/‎NLP_QAparser.ipynb‎
Lines changed: 355 additions & 0 deletions
@@ -0,0 +1,355 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package maxent_treebank_pos_tagger to\n",
+      "[nltk_data]     /home/jovyan/nltk_data...\n",
+      "[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-\n",
+      "[nltk_data]       date!\n",
+      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
+      "[nltk_data]     /home/jovyan/nltk_data...\n",
+      "[nltk_data]   Package averaged_perceptron_tagger is already up-to-\n",
+      "[nltk_data]       date!\n"
+     ]
+    }
+   ],
+   "source": [
+    "###################################################\n",
+    "#  Tool to Get Q-A Pairs from     Support Page    #\n",
+    "###################################################\n",
+    "#                                                 #\n",
+    "#                                                 #\n",
+    "#        AbderRahman N. Sobh - 12/27/2018         #\n",
+    "#               All Rights Reserved.              #\n",
+    "###################################################\n",
+    "\n",
+    "# Required Packages, install if anything is missing in your environment.\n",
+    "\n",
+    "#!pip install gensim\n",
+    "#!pip install nltk\n",
+    "#!pip install pandas\n",
+    "#!pip install matplotlib\n",
+    "#!pip install beautifulsoup4\n",
+    "\n",
+    "# Package Imports\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "import gensim\n",
+    "from gensim.utils import simple_preprocess\n",
+    "from gensim.parsing.preprocessing import STOPWORDS\n",
+    "from gensim.summarization.textcleaner import get_sentences\n",
+    "\n",
+    "from gensim.models import TfidfModel\n",
+    "from gensim.models.coherencemodel import CoherenceModel\n",
+    "\n",
+    "import nltk\n",
+    "nltk.download('maxent_treebank_pos_tagger')\n",
+    "nltk.download('averaged_perceptron_tagger')\n",
+    "\n",
+    "import urllib.request\n",
+    "from bs4 import BeautifulSoup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#################################################\n",
+    "# Parse all the Q-A URLs from the Support Pages #\n",
+    "#################################################\n",
+    "def get_qapages():\n",
+    "    \n",
+    "    # Parse the support page for sub-links\n",
+    "    support_page = urllib.request.urlopen('URL HERE')\n",
+    "    soup = BeautifulSoup(support_page, 'html.parser')\n",
+    "    sublinks = []\n",
+    "    for i in soup.find_all('a', class_='mt-listing-detailed-subpage-title internal'):\n",
+    "        sublinks+=[i['href']]\n",
+    "\n",
+    "    # Parse sublinks for sub-sublinks\n",
+    "    subsublinks = []\n",
+    "    for pg in sublinks:\n",
+    "        page = urllib.request.urlopen(pg)\n",
+    "        soup = BeautifulSoup(page, 'html.parser')\n",
+    "        for i in soup.find_all('a', class_='mt-listing-detailed-subpage-title internal'):\n",
+    "            subsublinks+=[i['href']]\n",
+    "\n",
+    "    # Parse sub-sublinks for actual Question-Answer pages\n",
+    "    qapages = []\n",
+    "    for link in subsublinks:\n",
+    "        page = urllib.request.urlopen(link)\n",
+    "        soup = BeautifulSoup(page, 'html.parser')\n",
+    "        for i in soup.find_all('a', class_='mt-listing-detailed-subpage-title internal'):\n",
+    "            qapages+=[i['href']]\n",
+    "            \n",
+    "    return qapages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#######################################################################\n",
+    "# Extract Question (Title) and Answer (Full Text) Pairs from each URL #\n",
+    "#######################################################################\n",
+    "def get_qapairs(qapages):\n",
+    "    \n",
+    "    qapairs = []\n",
+    "\n",
+    "    for qapg in qapages:\n",
+    "        page = urllib.request.urlopen(qapg)\n",
+    "        soup = BeautifulSoup(page, 'html.parser')\n",
+    "\n",
+    "        title = soup.find_all('h1', {'id' : 'title'})[0].contents[0].strip()\n",
+    "        fulltext = ''\n",
+    "        for block in soup.find_all('div', {'class' : 'mt-section'}):\n",
+    "            fulltext += ' {}'.format(block.text)\n",
+    "\n",
+    "        qapairs += [(title, fulltext.strip(), qapg)]\n",
+    "        \n",
+    "    return qapairs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "############################################################################################\n",
+    "# Extract NLP data structures from any specified text column (i.e. 'Question' or 'Answer') #\n",
+    "############################################################################################\n",
+    "def tokenize(text):\n",
+    "    return [token for token in simple_preprocess(text) if token not in STOPWORDS]\n",
+    "\n",
+    "def get_NLP(df, column):\n",
+    "\n",
+    "    # Extract all the data structures we need from the initial data: sentences, tokens, dictionaries, and corpuses\n",
+    "    df['tokens'] = df[column].apply(lambda x: tokenize(x))\n",
+    "    common_dictionary = gensim.corpora.Dictionary(df['tokens'])\n",
+    "    common_corpus = [common_dictionary.doc2bow(text) for text in df['tokens']]\n",
+    "    df['corpus'] = common_corpus\n",
+    "    df['pos_tags'] = df['tokens'].apply(lambda x: nltk.pos_tag(x))\n",
+    "    \n",
+    "    # Get a TFIDF Model and apply the scores to the words in each entry of the text column\n",
+    "    tmodel = TfidfModel(corpus=common_corpus)\n",
+    "    df['tfidf_wordscores'] = df['corpus'].apply(lambda x: [(common_dictionary.get(a),b) for (a,b) in tmodel[x]])\n",
+    "    \n",
+    "    icols = ['tokens','pos_tags', 'corpus','tfidf_wordscores']\n",
+    "    jcols = [item+'_'+column for item in icols]\n",
+    "    df = df.rename(columns=dict(zip(icols, jcols)))\n",
+    "    \n",
+    "    return df, common_dictionary, common_corpus\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "##################################################################\n",
+    "# Investigate Coherence Scores by iteratively fitting LDA models #\n",
+    "##################################################################\n",
+    "\n",
+    "# Coherence models are used to determine roughly how many topics we expect are present in the dataset\n",
+    "# This is an exploratory portion of the code which requires human inference to select the best option.\n",
+    "# Though, it is possible to score a maximization between both metrics as well.\n",
+    "\n",
+    "# Note that this portion of the code essentially creates a number of models with different topic selection counts\n",
+    "# and stores ALL of them as an array, allowing for easy testing of different model types on the fly.\n",
+    "# This is resource intensive!\n",
+    "\n",
+    "def compute_coherence_values(dictionary, corpus,texts, start, limit, step, ctype):\n",
+    "    coherence_values = []\n",
+    "    model_list = []\n",
+    "    \n",
+    "    for num_topics in range(start, limit, step):\n",
+    "        model= gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,num_topics=num_topics, random_state=7)\n",
+    "        model_list.append(model)\n",
+    "        cm = CoherenceModel(model=model, dictionary=dictionary, corpus=corpus, texts=texts, coherence=ctype)\n",
+    "        coherence_values.append(cm.get_coherence())\n",
+    "            \n",
+    "    return model_list, coherence_values\n",
+    "\n",
+    "\n",
+    "# Left off for the sake of time ... going off of human observation this time. See plotting function below.\n",
+    "#def maximize_coherence(cvals1, cvals2):\n",
+    "\n",
+    "# Plots of Standard scaled coherence values, their sum. \n",
+    "# Optimal topic selection should perform well across both performance metrics.\n",
+    "def plot_coherence(dictionary, corpus, tokens):\n",
+    "    # Compute u_mass and c_v coherence scores to compare results\n",
+    "    step = 1\n",
+    "    model_list1, coherence_values1 = compute_coherence_values(\n",
+    "        dictionary=dictionary, corpus=corpus, texts=None, start=2, limit=40, step=1, ctype='u_mass')\n",
+    "\n",
+    "    model_list2, coherence_values2 = compute_coherence_values(\n",
+    "        dictionary=dictionary, corpus=corpus, texts=tokens, start=2, limit=40, step=1, ctype='c_v')\n",
+    "\n",
+    "    # Put both metrics on the same scale for observation\n",
+    "    from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "    scaler = StandardScaler()\n",
+    "    scaler.fit(np.array(coherence_values1).reshape(-1,1))\n",
+    "    cvn1 = scaler.transform(np.array(coherence_values1).reshape(-1,1))\n",
+    "    scaler.fit(np.array(coherence_values2).reshape(-1,1))\n",
+    "    cvn2 = scaler.transform(np.array(coherence_values2).reshape(-1,1))\n",
+    "\n",
+    "\n",
+    "    # Plot coherence scores\n",
+    "    limit=40; start=2; step=1;\n",
+    "    x = range(start, limit, step)\n",
+    "    plt.figure(figsize=(10,5))\n",
+    "    plt.grid()\n",
+    "    plt.plot(x, cvn1)\n",
+    "    plt.xticks(np.arange(start, limit, step=step))\n",
+    "    plt.xlabel(\"Num Topics\")\n",
+    "    plt.ylabel(\"Coherence score (u_mass)\")\n",
+    "    plt.legend((\"coherence_values\"), loc='best')\n",
+    "    plt.show()\n",
+    "\n",
+    "    plt.figure(figsize=(10,5))\n",
+    "    plt.grid()\n",
+    "    plt.plot(x, cvn2)\n",
+    "    plt.xticks(np.arange(start, limit, step=step))\n",
+    "    plt.xlabel(\"Num Topics\")\n",
+    "    plt.ylabel(\"Coherence score (c_v)\")\n",
+    "    plt.legend((\"coherence_values\"), loc='best')\n",
+    "    plt.show()\n",
+    "    \n",
+    "    plt.figure(figsize=(10,5))\n",
+    "    plt.grid()\n",
+    "    plt.plot(x, cvn1+cvn2)\n",
+    "    plt.xticks(np.arange(start, limit, step=step))\n",
+    "    plt.xlabel(\"Num Topics\")\n",
+    "    plt.ylabel(\"Sum of Metrics\")\n",
+    "    plt.legend((\"coherence_values\"), loc='best')\n",
+    "    plt.show()\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "##############################################\n",
+    "# The main function which controls this tool #\n",
+    "##############################################\n",
+    "\n",
+    "def main():\n",
+    "    \n",
+    "    # Generate the DataFrame\n",
+    "    qapages = get_qapages()\n",
+    "    qapairs = get_qapairs(qapages)\n",
+    "    df = pd.DataFrame(qapairs)\n",
+    "    df.columns = ['Question', 'Answer', 'URL']\n",
+    "\n",
+    "    # Non-specific schema components\n",
+    "    df[''] = ''\n",
+    "    df['pre-processing'] = \"Convert to lowercase, remove tokens < 2 chars or > 15 chars, remove stopwords (Stone, Denis, Kwantes (2010)), split on whitespace\"\n",
+    "\n",
+    "    # Generate the full schema and fill with values\n",
+    "    cdict = {}\n",
+    "    ccorpus = {}\n",
+    "    LDAmodel = {}\n",
+    "    model_topics = {}\n",
+    "    col_list = ['Question', 'Answer', '','pre-processing']\n",
+    "    ce = ['bow', 'pos_tags', 'keywords', 'context', 'LDA', 'topic/intent', 'topic_key']\n",
+    "\n",
+    "    text_to_use = ['Question', 'Answer']    \n",
+    "\n",
+    "    for entry in text_to_use:\n",
+    "        # Enrich with NLP features, build Corpora and Dictionaries\n",
+    "        df, cdict[entry], ccorpus[entry] = get_NLP(df, entry)\n",
+    "\n",
+    "        # If in notebook, consider generating the plots for observation:\n",
+    "        #%matplotlib inline\n",
+    "        #print('{} LDA Coherence Plots'.format(entry))\n",
+    "        #plot_coherence(cdict[entry],ccorpus[entry], df['tokens_{}'.format(entry)])\n",
+    "\n",
+    "        # Fit and select the best LDA Models\n",
+    "        # Num of topics was determined by observation, though should be replaced with a good optimization\n",
+    "        LDAmodel[entry] = gensim.models.ldamodel.LdaModel(\n",
+    "            corpus=ccorpus[entry], id2word=cdict[entry], num_topics=9, random_state=7)\n",
+    "\n",
+    "        # Apply LDA to all documents\n",
+    "        df['LDA_TopicPresence_{}'.format(entry)] = df['corpus_{}'.format(entry)].apply(lambda x: LDAmodel[entry][x])\n",
+    "        # Apply a threshold to topic presence for final interpretations, here I use 10%\n",
+    "        df['topic/intent_{}'.format(entry)] = df['LDA_TopicPresence_{}'.format(entry)].apply(lambda x: [i for i,v in x if v > 0.1])\n",
+    "        model_topics[entry] = [LDAmodel[entry].show_topic(n) for n in range(0,9)]\n",
+    "        df['topic_key_{}'.format(entry)] = ''\n",
+    "        df['topic_key_{}'.format(entry)][0] = model_topics[entry]\n",
+    "\n",
+    "        \n",
+    "        # Apply expected schema components\n",
+    "        df['keywords_{}'.format(entry)] = df['tfidf_wordscores_{}'.format(entry)]\n",
+    "\n",
+    "        df['context_{}'.format(entry)] = df['pos_tags_{}'.format(entry)].apply(lambda x: [k for k,v in x if v == 'NN'])\n",
+    "        df = df.rename(columns={'tokens_{}'.format(entry):'bow_{}'.format(entry),\n",
+    "                               'LDA_TopicPresence_{}'.format(entry):'LDA_{}'.format(entry)})\n",
+    "\n",
+    "        col_list = col_list + [i+'_{}'.format(entry) for i in ce]\n",
+    "\n",
+    "    fdf = df[col_list]    \n",
+    "    fdf.to_csv('out.csv')\n",
+    "    return\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "main()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}