Commit 65c5ccc

Vincent Le's Capstone Project (#2)
* Clean up
* Initialize project
* Finish DAG
* Visualize data using Streamlit
* Ochestrate the pipeline
* Add start and stop scripts
* Modified README file
* Added Dev Container Folder
* Stop ignoring data folder for deployment purpose
* Try updating base url
* Try updating base url
* If it's streamlit cloud, change base url
* Try again
* Revert
* Update README for live streamlit page
* Add diagram
* Minor fix to start script
* Remove devcontainer
* Modulize the transform functions
* Change scripts to utils, separate also tasks
* Update README file
1 parent 4fd2a2f commit 65c5ccc

31 files changed: +51597 −145 lines

VincentLeV/.gitignore

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
/__pycache__
__pycache__
logs
.env
servers.json

VincentLeV/README.MD

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
# Vincent's Capstone: Banned Books ETL Pipeline

This capstone project demonstrates how to orchestrate scraping the PEN America website for banned-books data, cleaning and transforming the data, and loading it into CSV files. The data is then visualized using Streamlit.

## Tools

- Python (3.12)
- Pandas
- PostgreSQL
- Airflow (3.0.0+)
- Streamlit

---

## Project Flow

1. Scrape banned books data using Python
2. Clean the data using Pandas
3. Extract the data to CSV
4. Save the data to PostgreSQL
5. Visualize the data in Streamlit

Step 4 is redundant in this project, since I load the data from the CSV files, but I include it so that the project can scale later.
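The five steps above can be sketched as a plain-Python pipeline. This is only an illustration of the flow, not the project's actual DAG (which lives in `dags/get_banned_books.py`); the `scrape`, `clean`, and `extract_to_csv` function names and the sample rows are made up for the sketch.

```python
import pandas as pd

def scrape() -> pd.DataFrame:
    # Hypothetical stand-in for step 1: the real pipeline scrapes the
    # PEN America site; here we return a tiny hard-coded frame.
    return pd.DataFrame({
        "Title": ["Book A ", "Book A ", "Book B"],
        "State": ["Texas", "Florida", "Texas"],
        "Ban Status": ["banned", "banned", "banned pending investigation"],
    })

def clean(raw: pd.DataFrame) -> pd.DataFrame:
    # Step 2: normalize whitespace and drop exact duplicate rows.
    cleaned = raw.apply(lambda col: col.str.strip())
    return cleaned.drop_duplicates()

def extract_to_csv(df: pd.DataFrame, path: str) -> None:
    # Step 3: persist the cleaned frame.
    df.to_csv(path, index=False)

def run_pipeline(path: str) -> pd.DataFrame:
    df = clean(scrape())
    extract_to_csv(df, path)
    # Step 4 (load into PostgreSQL) would go here, e.g. df.to_sql(...)
    # against the docker-compose Postgres service; step 5 is Streamlit.
    return df

result = run_pipeline("banned_books_sample.csv")
```

In the real project each of these stages is an Airflow task, so failures and retries are handled per step rather than for the whole script.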
![Diagram](./banned-books-pipeline-diagram.jpg)

---

## Project Structure

```
VincentLeV/
├── app/
│   ├── data/
│   │   └── banned_books/
│   │       ├── banned_books.csv       # The main dataset used for visualization
│   │       └── ...
│   ├── app.py                         # Streamlit home page
│   ├── ...
│   └── Dockerfile
├── config/
│   ├── generate_pgadmin_server.py     # Makes sure the server is ready in pgAdmin
│   └── ...
├── dags/
│   └── get_banned_books.py            # Airflow DAG that handles the data processing
├── tasks/
│   └── banned_books_taks.py           # Airflow tasks used in the get_banned_books DAG
├── utils/
│   ├── constants.py                   # Common constants used in util functions/tasks/DAG
│   └── transform_banned_books.py      # Util functions that clean and transform the data
├── docker-compose.yaml
├── start.sh                           # App start script
└── stop.sh                            # App stop script
```
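The `utils/` vs. `tasks/` split keeps the pandas transforms importable and testable without an Airflow environment. A minimal sketch of that pattern is below; the function name and column handling are hypothetical, not the actual contents of `transform_banned_books.py`, and the task is shown as a plain function where the real DAG would apply Airflow's `@task` decorator.

```python
import pandas as pd

# utils/: pure pandas transforms with no Airflow imports,
# so they can be unit-tested directly.
def normalize_ban_status(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["Ban Status"] = out["Ban Status"].str.lower().str.strip()
    return out

# tasks/: thin wrappers that the DAG registers as Airflow tasks.
def transform_task(df: pd.DataFrame) -> pd.DataFrame:
    return normalize_ban_status(df)

df = pd.DataFrame({"Ban Status": ["  Banned ", "BANNED"]})
statuses = transform_task(df)["Ban Status"].tolist()  # ["banned", "banned"]
```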
## Data Visualization

The Streamlit app is deployed here:

```
https://vincent-banned-books.streamlit.app/
```

## Setup and Run Locally

### For Running the Project the First Time

1. In the terminal, run these commands:
   ```bash
   cd VincentLeV
   ./start.sh
   ```
   The terminal will prompt for some variable inputs; type in the values you want.
2. After Docker has completed the process, navigate here to check out the Airflow processes:
   ```
   http://localhost:8080/
   ```
3. If everything runs well in step 2, the data is ready. Navigate to this page to check out the visualization:
   ```
   http://localhost:8502/
   ```

Out of curiosity, the data in PostgreSQL can be checked here:
```
http://localhost:5050/
```

pgAdmin is pre-loaded with a server under the name you entered at the prompt `Enter pgadmin server name`.

Log into the DB with the password you provided at the prompt `Enter postgres password`, and you will see the data.

### For Subsequent Runs

In the terminal, run these commands:
```bash
cd VincentLeV
docker compose up -d
```

## Stop the App

In the terminal, run this script from inside the `VincentLeV` folder:
```bash
./stop.sh
```

VincentLeV/app/Dockerfile

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
FROM python:3.9-slim

WORKDIR /app

RUN apt-get update && apt-get install -y \
    libpq-dev gcc build-essential --no-install-recommends && \
    rm -rf /var/lib/apt/lists/*

COPY . .

RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt

EXPOSE 8502

CMD ["streamlit", "run", "app.py", "--server.port=8502", "--server.address=0.0.0.0", "--server.fileWatcherType=poll"]

VincentLeV/app/app.py

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
import streamlit as st

st.set_page_config(
    page_icon=":books:",
    layout="wide",
)

pg = st.navigation([
    st.Page("home.py", title="Overview", icon=":material/home:"),
    st.Page("states_and_districts.py", title="States and Districts", icon=":material/location_on:"),
])

pg.run()

VincentLeV/app/data/banned_books/banned_books.csv

Lines changed: 34258 additions & 0 deletions
Large diffs are not rendered by default.

VincentLeV/app/data/banned_books/pen-2021-2022.csv

Lines changed: 2533 additions & 0 deletions
Large diffs are not rendered by default.

VincentLeV/app/data/banned_books/pen-2023-2024.csv

Lines changed: 10047 additions & 0 deletions
Large diffs are not rendered by default.

VincentLeV/app/data/banned_books/pen-2023.csv

Lines changed: 267 additions & 0 deletions
Large diffs are not rendered by default.

VincentLeV/app/home.py

Lines changed: 144 additions & 0 deletions

@@ -0,0 +1,144 @@
import os

import pandas as pd
import plotly.express as px
import streamlit as st

from utils import get_base_data_url, rank_dataframe

BASE_DATA_URL = get_base_data_url()
DATA_URL = os.path.join(BASE_DATA_URL, "banned_books.csv")

ban_colors = {
    "banned": "#bf0603",
    "banned from libraries and classrooms": "#ff6d00",
    "banned by restriction": "#0096c7",
    "banned pending investigation": "#ffea00"
}

@st.cache_data
def load_data(path: str):
    data = pd.read_csv(path)
    return data

def by_year_bar_chart(data: pd.DataFrame):
    year_status_counts = (
        data
        .groupby(["Year", "Ban Status"])
        ["Title"].nunique()
        .reset_index(name="Titles")
    )

    fig = px.bar(
        year_status_counts,
        x="Year",
        y="Titles",
        color="Ban Status",
        color_discrete_map=ban_colors,
        labels={"Titles": "Titles", "Year": "Year", "Ban Status": "Ban Status"},
        title="Banned Books by Year",
        barmode="group"
    )

    st.plotly_chart(fig)

def by_origin_of_challenge_bar_chart(data: pd.DataFrame):
    origin_status_counts = (
        data
        .groupby(["Origin of Challenge", "Ban Status"])
        ["Title"].nunique()
        .reset_index(name="Titles")
    )

    fig = px.bar(
        origin_status_counts,
        x="Origin of Challenge",
        y="Titles",
        color="Ban Status",
        color_discrete_map=ban_colors,
        labels={"Titles": "Titles", "Origin of Challenge": "Origin of Challenge", "Ban Status": "Ban Status"},
        title="Banned Books by Origin of Challenge",
        barmode="group"
    )

    st.plotly_chart(fig)

def top_5_banned_titles(data: pd.DataFrame):
    filtered_data = data[(data["Ban Status"] == "banned") | (data["Ban Status"] == "banned from libraries and classrooms")]
    title_counts = filtered_data.groupby(["Title", "Author"]).size().reset_index(name="Ban Count")
    top_titles = title_counts.sort_values(by="Ban Count", ascending=False).head(5)
    return top_titles[["Title", "Author", "Ban Count"]]

def top_5_challenged_titles(data: pd.DataFrame):
    filtered_data = data[(data["Ban Status"] == "banned by restriction") | (data["Ban Status"] == "banned pending investigation")]
    title_counts = filtered_data.groupby(["Title", "Author"]).size().reset_index(name="Ban Count")
    top_titles = title_counts.sort_values(by="Ban Count", ascending=False).head(5)
    return top_titles[["Title", "Author", "Ban Count"]]

def top_5_banned_authors(data: pd.DataFrame):
    filtered_data = data[(data["Ban Status"] == "banned") | (data["Ban Status"] == "banned from libraries and classrooms")]
    author_counts = filtered_data.groupby(["Author"]).size().reset_index(name="Count")
    top_authors = author_counts.sort_values(by="Count", ascending=False).head(5)
    return top_authors[["Author", "Count"]]

def top_5_challenged_authors(data: pd.DataFrame):
    filtered_data = data[(data["Ban Status"] == "banned by restriction") | (data["Ban Status"] == "banned pending investigation")]
    author_counts = filtered_data.groupby(["Author"]).size().reset_index(name="Count")
    top_authors = author_counts.sort_values(by="Count", ascending=False).head(5)
    return top_authors[["Author", "Count"]]

def display_data(data: pd.DataFrame):
    st.title("Overview of Banned Books in the US (2021-2024)")

    st.info('Data is pulled from https://pen.org/', icon="ℹ️")

    cols1 = st.columns([0.3, 0.7], vertical_alignment="center")

    unique_titles = data["Title"].nunique()
    cols1[0].markdown(f"<p style='text-align: center; font-size: 2.5rem; font-weight: bold;'>{unique_titles}</p>", unsafe_allow_html=True)
    cols1[0].markdown("<p style='text-align: center;'>books are banned between 2021 and 2024</p>", unsafe_allow_html=True)

    unique_states = data["State"].nunique()
    cols1[0].markdown(f"<p style='text-align: center; font-size: 2.1rem; font-weight: bold;'>{unique_states}</p>", unsafe_allow_html=True)
    cols1[0].markdown("<p style='text-align: center;'>states are involved</p>", unsafe_allow_html=True)

    unique_districts = data["District"].nunique()
    cols1[0].markdown(f"<p style='text-align: center; font-size: 2.1rem; font-weight: bold;'>{unique_districts}</p>", unsafe_allow_html=True)
    cols1[0].markdown("<p style='text-align: center;'>districts are involved</p>", unsafe_allow_html=True)

    with cols1[1]:
        by_year_bar_chart(data)

    by_origin_of_challenge_bar_chart(data)

    cols2 = st.columns([0.5, 0.5], vertical_alignment="center")

    with cols2[0]:
        st.subheader("Top 5 Most Banned Titles")
        top_titles = top_5_banned_titles(data)
        ranked_titles = rank_dataframe(top_titles, rank_column_name="Rank")
        st.dataframe(ranked_titles.set_index("Rank"))

    with cols2[1]:
        st.subheader("Top 5 Most Banned Authors")
        top_authors = top_5_banned_authors(data)
        ranked_authors = rank_dataframe(top_authors, rank_column_name="Rank")
        st.dataframe(ranked_authors.set_index("Rank"))

    cols3 = st.columns([0.5, 0.5], vertical_alignment="center")

    with cols3[0]:
        st.subheader("Top 5 Most Challenged Titles")
        top_challenged_titles = top_5_challenged_titles(data)
        ranked_challenged_titles = rank_dataframe(top_challenged_titles, rank_column_name="Rank")
        st.dataframe(ranked_challenged_titles.set_index("Rank"))

    with cols3[1]:
        st.subheader("Top 5 Most Challenged Authors")
        top_challenged_authors = top_5_challenged_authors(data)
        ranked_challenged_authors = rank_dataframe(top_challenged_authors, rank_column_name="Rank")
        st.dataframe(ranked_challenged_authors.set_index("Rank"))

display_data(load_data(DATA_URL))
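The aggregation behind `by_year_bar_chart` in home.py counts unique titles per (Year, Ban Status) pair, so repeated bans of the same title within a year do not inflate the bar. It can be checked on a tiny frame (the sample rows below are made up):

```python
import pandas as pd

data = pd.DataFrame({
    "Year": [2022, 2022, 2022, 2023],
    "Ban Status": ["banned", "banned", "banned", "banned"],
    "Title": ["A", "A", "B", "A"],
})

# Same shape as the chart's input: nunique() counts each title once
# per group, then reset_index names the count column.
year_status_counts = (
    data
    .groupby(["Year", "Ban Status"])
    ["Title"].nunique()
    .reset_index(name="Titles")
)
# 2022/banned has titles {A, B} -> 2; 2023/banned has {A} -> 1
```

Had the code used `.size()` instead of `["Title"].nunique()`, the 2022 bar would read 3, counting title A twice.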

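`rank_dataframe` is imported from the app's `utils` module, which is not part of this diff. A plausible minimal version, assuming it only prepends a 1-based rank column reflecting row order (a guess at the helper, not its actual source):

```python
import pandas as pd

def rank_dataframe(df: pd.DataFrame, rank_column_name: str = "Rank") -> pd.DataFrame:
    # Assumed behavior: add a 1-based rank column in row order, so
    # home.py can call .set_index("Rank") on the result for display.
    ranked = df.reset_index(drop=True).copy()
    ranked.insert(0, rank_column_name, range(1, len(ranked) + 1))
    return ranked

top = pd.DataFrame({"Author": ["X", "Y"], "Count": [9, 7]})
ranked = rank_dataframe(top)
```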
VincentLeV/app/requirements.txt

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
python-dotenv
pandas
streamlit
plotly
requests
beautifulsoup4
apache-airflow
apache-airflow-providers-common-sql
apache-airflow-providers-postgres
apache-airflow-providers-standard
