Commit 81798c3
PR for capstone project - Komal Azram (#5)

* Komal's capstone project
* Refactored code
* Updated README
1 parent 5764f79 commit 81798c3

27 files changed (+1113, -0 lines)

komalazram/.flake8

Lines changed: 2 additions & 0 deletions
```ini
[flake8]
max-line-length = 150
```
Lines changed: 123 additions & 0 deletions
```yaml
name: ETL CI/CD

on:
  push:
    branches:
      - master
  pull_request:

jobs:
  ci:
    name: CI - Lint, Terraform Check, and Plan
    runs-on: ubuntu-latest
    environment: production

    steps:
      - uses: actions/checkout@v3

      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: 1.4.6
          terraform_wrapper: false

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v1
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set Terraform Environment Variables
        run: |
          echo "TF_VAR_project_id=${{ secrets.PROJECT_ID }}" >> $GITHUB_ENV
          echo "TF_VAR_bucket_names=${{ secrets.BUCKET_NAME }}" >> $GITHUB_ENV
          echo "TF_VAR_bucket_location=${{ secrets.BUCKET_LOCATION }}" >> $GITHUB_ENV
          echo "TF_VAR_citibike_composer_name=${{ secrets.citibike_composer_name }}" >> $GITHUB_ENV
          echo "TF_VAR_composer_region=${{ secrets.COMPOSER_REGION }}" >> $GITHUB_ENV
          echo "TF_VAR_composer_service_account=${{ secrets.COMPOSER_SERVICE_ACCOUNT }}" >> $GITHUB_ENV

      - name: Terraform Format Check
        run: |
          cd terraform
          terraform fmt -check -recursive

      - name: Terraform Init
        run: |
          cd terraform
          terraform init

      - name: Terraform Validate
        run: |
          cd terraform
          terraform validate

      - name: Terraform Plan
        run: |
          cd terraform
          terraform plan -input=false

      - name: Install Poetry
        uses: snok/install-poetry@v1

      - name: Install Dependencies
        run: poetry install

      - name: Python Lint
        run: poetry run flake8 .

  cd:
    name: CD - Deploy Infra and DAGs
    needs: ci
    if: github.ref == 'refs/heads/master'
    runs-on: ubuntu-latest
    environment: production

    steps:
      - uses: actions/checkout@v3

      - name: Set up Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: 1.4.6
          terraform_wrapper: false

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v1
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set Terraform Environment Variables
        run: |
          echo "TF_VAR_project_id=${{ secrets.PROJECT_ID }}" >> $GITHUB_ENV
          echo "TF_VAR_bucket_names=${{ secrets.BUCKET_NAME }}" >> $GITHUB_ENV
          echo "TF_VAR_bucket_location=${{ secrets.BUCKET_LOCATION }}" >> $GITHUB_ENV
          echo "TF_VAR_citibike_composer_name=${{ secrets.citibike_composer_name }}" >> $GITHUB_ENV
          echo "TF_VAR_composer_region=${{ secrets.COMPOSER_REGION }}" >> $GITHUB_ENV
          echo "TF_VAR_composer_service_account=${{ secrets.COMPOSER_SERVICE_ACCOUNT }}" >> $GITHUB_ENV

      - name: Terraform Init
        run: |
          cd terraform
          terraform init

      - name: Terraform Apply
        run: |
          cd terraform
          terraform apply -auto-approve

      - name: Get Composer Bucket Name
        run: |
          cd terraform
          COMPOSER_BUCKET=$(terraform output -raw composer_bucket_name 2>/dev/null || echo "")
          echo "COMPOSER_BUCKET=$COMPOSER_BUCKET" >> $GITHUB_ENV

      - name: Deploy DAGs to Composer
        run: |
          gcloud storage cp --recursive dags gs://$COMPOSER_BUCKET/

      - name: Deploy Plugins to Composer
        run: |
          gcloud storage cp --recursive plugins gs://$COMPOSER_BUCKET/

      - name: Deploy Data to Composer
        run: |
          gcloud storage cp --recursive data gs://$COMPOSER_BUCKET/
```

komalazram/.gitignore

Lines changed: 38 additions & 0 deletions
```gitignore
# Local .terraform directories
.terraform/

# .tfstate files
*.tfstate
*.tfstate.*

# Crash log files
crash.log
crash.*.log

# Exclude all .tfvars files, which are likely to contain sensitive data, such as
# password, private keys, and other secrets. These should not be part of version
# control as they are data points which are potentially sensitive and subject
# to change depending on the environment.
#*.tfvars
#*.tfvars.json

# Ignore override files as they are usually used to override resources locally and so
# are not checked in
override.tf
override.tf.json
*_override.tf
*_override.tf.json

# Ignore transient lock info files created by terraform apply
.terraform.tfstate.lock.info

# Include override files you do wish to add to version control using negated pattern
# !example_override.tf

# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
# example: *tfplan*

# Ignore CLI configuration files
.terraformrc
terraform.rc
terraform.tfvars
```

komalazram/README.md

Lines changed: 70 additions & 0 deletions
# Citi Bike ETL Workflow for Analytics-Ready Data

This project builds an ETL pipeline that consumes Citi Bike’s open API data and stores it in a well-structured gold-level table. The cleaned and unified data is designed to support future analytics and decision-making, such as:

- Identifying stations that are frequently full or empty.
- Improving resource allocation for bike availability.
- Analyzing station-level usage patterns.

---

## Architecture

![Architecture Diagram](architecture.png)

The pipeline takes a weekly snapshot of station status and keeps the warehouse up to date. It follows the **Bronze → Silver → Gold** data architecture pattern:

### Bronze Layer
- Extracts and loads raw Citi Bike data from the API (see the sketch below).
- Stores raw JSON files in Google Cloud Storage (GCS).
- Performs JSON schema validation.
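The ingestion callable that the DAG imports (`fetch_and_upload` from `citi_bike_scrapper_bronze`, under `plugins/`) is not included in this diff. Below is a minimal sketch of what such a Bronze step could look like, assuming the public CityBikes v2 endpoint for the Citi Bike network and the `requests` and `google-cloud-storage` packages; the endpoint, object naming, and function body are illustrative, not the author's implementation.

```python
# Hypothetical sketch of a Bronze-layer ingestion step; the real
# fetch_and_upload lives in plugins/ and may differ.
import json
from datetime import datetime, timezone

import requests
from google.cloud import storage

# Assumed CityBikes v2 endpoint for the Citi Bike NYC network.
CITYBIKES_URL = "http://api.citybik.es/v2/networks/citi-bike-nyc"


def fetch_and_upload(bucket_name: str) -> str:
    """Fetch the current station snapshot and store it as raw JSON in GCS."""
    response = requests.get(CITYBIKES_URL, timeout=30)
    response.raise_for_status()
    payload = response.json()

    # Timestamped object name so each weekly snapshot is kept separately.
    snapshot = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    blob_name = f"citi-bike/raw_{snapshot}.json"

    storage.Client().bucket(bucket_name).blob(blob_name).upload_from_string(
        json.dumps(payload), content_type="application/json"
    )
    return blob_name
```

In the DAG further down, the target bucket is injected through `op_kwargs={"bucket_name": "{{ params.bronze_bucket }}"}`.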
### Silver Layer
- Cleans and flattens the raw JSON using Python, handling timestamp formats, missing values, and data type conversions (see the sketch below).
- Writes the cleaned data to a GCS Silver bucket and loads it into a BigQuery staging table.
- Uses a MERGE operation to update the master BigQuery table with only new or changed records from the staging table.
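`raw_transformation` (from `data_transformation_silver`, under `plugins/`) is likewise not part of this diff. Here is a rough sketch under the assumption that it reads the latest Bronze JSON, flattens one row per station with pandas, and writes Parquet to the Silver bucket for `GCSToBigQueryOperator` to pick up; writing `gs://` paths directly from pandas assumes `gcsfs` is installed, and the column handling is illustrative rather than the author's code.

```python
# Hypothetical sketch of the Silver-layer transformation; the real
# raw_transformation in plugins/ may differ.
import json
from datetime import datetime, timezone

import pandas as pd
from google.cloud import storage

BRONZE_BUCKET = "bronze113"  # assumed to match the DAG's default params


def raw_transformation(bucket_name: str) -> str:
    """Flatten the newest raw snapshot and write it as Parquet to the Silver bucket."""
    client = storage.Client()

    # Pick the most recently created raw JSON object from the Bronze bucket.
    blobs = sorted(
        client.list_blobs(BRONZE_BUCKET, prefix="citi-bike/"),
        key=lambda blob: blob.time_created,
    )
    raw = json.loads(blobs[-1].download_as_text())

    # One row per station; normalise timestamps, missing values, and types.
    stations = pd.json_normalize(raw["network"]["stations"])
    stations["timestamp"] = pd.to_datetime(stations["timestamp"], utc=True, errors="coerce")
    stations["free_bikes"] = stations["free_bikes"].fillna(0).astype("int64")
    stations["empty_slots"] = stations["empty_slots"].fillna(0).astype("int64")
    stations["snapshot_time"] = datetime.now(timezone.utc)

    # The DAG's GCSToBigQueryOperator then loads citi-bike/*.parquet into staging.
    out_path = f"gs://{bucket_name}/citi-bike/stations_{datetime.now(timezone.utc):%Y%m%dT%H%M%SZ}.parquet"
    stations.to_parquet(out_path, index=False)
    return out_path
```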
### Gold Layer
- Aggregates and curates data for reporting and dashboarding.

---

## Technologies Used

| Tool                 | Purpose                                      |
|----------------------|----------------------------------------------|
| Python               | Data extraction, validation, transformation  |
| Google Cloud Storage | Stores raw and intermediate data             |
| BigQuery             | Data warehousing and analytics               |
| Airflow/Composer     | Orchestrates the data pipeline               |
| Terraform            | Infrastructure as Code (IaC)                 |
| GitHub Actions       | CI/CD for deployment workflows               |

---

## Folder Structure

```bash
.
├── dags/         # Airflow DAGs
├── plugins/      # Python scripts for ETL
├── sql/          # Schema definition files for BigQuery tables
├── terraform/    # Infrastructure configuration
├── data/         # SQL for Silver (staging) and Gold (aggregation) layers
├── README.md     # Project documentation
```

## Setup Instructions

1. Clone the repository.
2. Add your GCP service account JSON key as a secret in your GitHub repository (`GCP_SA_KEY`).
3. Install dependencies using Poetry:
   ```
   poetry install
   ```
4. Modify the variable values in the Terraform files to match your GCP project, region, and desired bucket names.
5. Update the environment secrets in GitHub.
6. Pass bucket names and other runtime parameters to the DAGs.
7. Push the code to GitHub; a push to the master branch triggers the CI/CD workflow (a local DAG-parse check is sketched below).
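Before pushing, it can help to confirm that the DAG file parses. This check is not part of the repository; it assumes Airflow is installed locally (e.g. via `poetry install`) and that `plugins/` must be put on the import path so the plugin modules resolve.

```python
# Local sanity check (assumed workflow, not part of the repo): verify the DAG
# parses and list its tasks before pushing to trigger CI/CD.
import sys

sys.path.insert(0, "plugins")  # so citi_bike_scrapper_bronze etc. resolve

from airflow.models import DagBag

dag_bag = DagBag(dag_folder="dags", include_examples=False)
assert not dag_bag.import_errors, dag_bag.import_errors
dag = dag_bag.get_dag("citi_bike_bronze_ingestion")
print([task.task_id for task in dag.tasks])
```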

komalazram/architecture.png

114 KB

komalazram/dags/datalake_dags.py

Lines changed: 89 additions & 0 deletions
```python
from datetime import datetime
from airflow import DAG
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import (
    GCSToBigQueryOperator,
)
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator


from citi_bike_scrapper_bronze import fetch_and_upload
from data_transformation_silver import raw_transformation


default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2024, 1, 1),
    "retries": 2,
}

with DAG(
    dag_id="citi_bike_bronze_ingestion",
    default_args=default_args,
    schedule_interval="0 10 * * 1",  # Every Monday 10:00 AM
    catchup=False,
    template_searchpath="/home/airflow/gcs/data/",
    tags=["citi_bike"],
    params={"bronze_bucket": "bronze113", "silver_bucket": "silver113"},
) as dag:

    ingest_task = PythonOperator(
        task_id="ingest_to_bronze",
        python_callable=fetch_and_upload,
        op_kwargs={"bucket_name": "{{ params.bronze_bucket }}"},
    )

    transform_task = PythonOperator(
        task_id="transform_to_silver",
        python_callable=raw_transformation,
        op_kwargs={"bucket_name": "{{ params.silver_bucket }}"},
    )

    load_parquet_to_staging = GCSToBigQueryOperator(
        task_id="load_parquet_to_staging",
        bucket="silver113",
        source_objects=["citi-bike/*.parquet"],
        destination_project_dataset_table="citi-bike-459310.lake_silver._staging_master_bike_station_status",
        source_format="PARQUET",
        write_disposition="WRITE_APPEND",
        create_disposition="CREATE_NEVER",
        autodetect=True,
        ignore_unknown_values=True,
        project_id="citi-bike-459310",
    )

    clear_silver_task = GCSDeleteObjectsOperator(
        task_id="clear_silver_folder",
        bucket_name="silver113",
        prefix="citi-bike/",
    )

    merge_staging_to_master = BigQueryInsertJobOperator(
        task_id="merge_staging_to_master",
        configuration={
            "query": {
                "query": "{% include 'merge_staging_to_master.sql' %}",
                "useLegacySql": False,
            }
        },
    )

    populate_gold_table = BigQueryInsertJobOperator(
        task_id="populate_gold_station_utilization_weekly",
        configuration={
            "query": {
                "query": "{% include 'populate_gold_station_utilization_weekly.sql' %}",
                "useLegacySql": False,
            }
        },
    )

    (
        ingest_task
        >> transform_task
        >> load_parquet_to_staging
        >> clear_silver_task
        >> merge_staging_to_master
        >> populate_gold_table
    )
```
Lines changed: 9 additions & 0 deletions
```sql
CREATE TABLE IF NOT EXISTS lake_gold.gold_station_utilization_weekly (
  station_id STRING,
  network_id STRING,
  week_start DATE,
  is_frequently_full BOOL,
  is_frequently_empty BOOL,
  avg_free_bikes FLOAT64,
  avg_empty_slots FLOAT64
);
```
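The query behind the DAG's `populate_gold_station_utilization_weekly` task is included from the `data/` folder and is not shown in this diff. Below is a hedged sketch of a weekly aggregation that would fill the table defined above, run here through the BigQuery Python client purely for illustration (the project runs it via `BigQueryInsertJobOperator`); the 20% full/empty thresholds and the seven-day window are assumptions, not the author's logic.

```python
# Hypothetical sketch of populate_gold_station_utilization_weekly.sql.
from google.cloud import bigquery

POPULATE_GOLD_SQL = """
INSERT INTO `lake_gold.gold_station_utilization_weekly`
  (station_id, network_id, week_start,
   is_frequently_full, is_frequently_empty, avg_free_bikes, avg_empty_slots)
SELECT
  station_id,
  network_id,
  DATE_TRUNC(DATE(timestamp), WEEK(MONDAY)) AS week_start,
  AVG(IF(empty_slots = 0, 1, 0)) >= 0.2 AS is_frequently_full,   -- assumed threshold
  AVG(IF(free_bikes = 0, 1, 0)) >= 0.2 AS is_frequently_empty,   -- assumed threshold
  AVG(free_bikes) AS avg_free_bikes,
  AVG(empty_slots) AS avg_empty_slots
FROM `citi-bike-459310.lake_silver.master_bike_station_status`
WHERE DATE(timestamp) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)  -- assumed window
GROUP BY station_id, network_id, week_start
"""

if __name__ == "__main__":
    bigquery.Client(project="citi-bike-459310").query(POPULATE_GOLD_SQL).result()
```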
Lines changed: 18 additions & 0 deletions
```sql
CREATE TABLE IF NOT EXISTS `citi-bike-459310.lake_silver._staging_master_bike_station_status` (
  network_id STRING,
  network_name STRING,
  station_id STRING,
  latitude FLOAT64,
  longitude FLOAT64,
  timestamp TIMESTAMP,
  free_bikes INT64,
  empty_slots INT64,
  extra_uid STRING,
  renting BOOL,
  returning BOOL,
  has_ebikes BOOL,
  ebikes INT64,
  snapshot_time TIMESTAMP
)
PARTITION BY DATE(timestamp)
CLUSTER BY station_id;
```
Lines changed: 17 additions & 0 deletions
```sql
CREATE TABLE IF NOT EXISTS `citi-bike-459310.lake_silver.master_bike_station_status` (
  network_id STRING,
  network_name STRING,
  station_id STRING,
  latitude FLOAT64,
  longitude FLOAT64,
  timestamp TIMESTAMP,
  free_bikes INT64,
  empty_slots INT64,
  extra_uid STRING,
  renting BOOL,
  returning BOOL,
  has_ebikes BOOL,
  ebikes INT64
)
PARTITION BY DATE(timestamp)
CLUSTER BY station_id;
```
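`merge_staging_to_master.sql`, which the DAG includes from the `data/` folder, is also absent from this diff. The following is a sketch of the kind of MERGE it might perform between the two tables defined above; the join keys (`station_id`, `timestamp`), the deduplication on `snapshot_time`, and the choice of updated columns are assumptions rather than the author's SQL.

```python
# Hypothetical sketch of merge_staging_to_master.sql; keys and update
# rules are assumptions, not the author's SQL.
from google.cloud import bigquery

MERGE_SQL = """
MERGE `citi-bike-459310.lake_silver.master_bike_station_status` AS master
USING (
  -- Keep one row per (station_id, timestamp), preferring the latest snapshot.
  SELECT * EXCEPT (row_num)
  FROM (
    SELECT *,
           ROW_NUMBER() OVER (
             PARTITION BY station_id, timestamp
             ORDER BY snapshot_time DESC
           ) AS row_num
    FROM `citi-bike-459310.lake_silver._staging_master_bike_station_status`
  )
  WHERE row_num = 1
) AS staging
ON master.station_id = staging.station_id
   AND master.timestamp = staging.timestamp
WHEN MATCHED THEN
  UPDATE SET
    free_bikes = staging.free_bikes,
    empty_slots = staging.empty_slots,
    renting = staging.renting,
    returning = staging.returning,
    has_ebikes = staging.has_ebikes,
    ebikes = staging.ebikes
WHEN NOT MATCHED THEN
  INSERT (network_id, network_name, station_id, latitude, longitude, timestamp,
          free_bikes, empty_slots, extra_uid, renting, returning, has_ebikes, ebikes)
  VALUES (staging.network_id, staging.network_name, staging.station_id,
          staging.latitude, staging.longitude, staging.timestamp,
          staging.free_bikes, staging.empty_slots, staging.extra_uid,
          staging.renting, staging.returning, staging.has_ebikes, staging.ebikes)
"""

if __name__ == "__main__":
    # In the project this SQL runs inside BigQueryInsertJobOperator via {% include %}.
    bigquery.Client(project="citi-bike-459310").query(MERGE_SQL).result()
```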
