0% found this document useful (0 votes)
54 views

Analytical SQL Documentation

The document appears to be an analytical case study submitted by Hafsa Ahmed Zahran to Eng. Yara Abdallah. It contains 5 steps of SQL queries analyzing an online retail dataset. The queries explore the dataset, analyze customer purchasing patterns, rank countries by various sales metrics, identify profitable months, and determine the top 10 returned products. The overall goal appears to be gaining business insights from the transaction data to improve operations and sales.

Uploaded by

Hafsa Zahran
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
54 views

Analytical SQL Documentation

The document appears to be an analytical case study submitted by Hafsa Ahmed Zahran to Eng. Yara Abdallah. It contains 5 steps of SQL queries analyzing an online retail dataset. The queries explore the dataset, analyze customer purchasing patterns, rank countries by various sales metrics, identify profitable months, and determine the top 10 returned products. The overall goal appears to be gaining business insights from the transaction data to improve operations and sales.

Uploaded by

Hafsa Zahran
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 10

Data Visualization Track

Information Technology Institute (ITI)


Smart Village

Analytical SQL Case Study

Submitted by: Hafsa Ahmed Zahran


Submitted to: Eng. Yara Abdallah

2023
1
Q1: The Query

Step 1:
Exploring the dataset by getting the number of distinct invoices, products, customers, and places

with ex_table (num_total_invoice, num_sold_invoice, num_canceled_invoice ,num_product,


num_customer, num_country)

as(

select

distinct count(invoiceno)as num_total_invoice,

(select distinct count(invoiceno) from Online_Retail where invoiceno not like 'C%') as
num_sold_invoice,

(select distinct count(invoiceno) from Online_Retail where invoiceno like 'C%') as


num_canceled_invoice,

to_char(count(distinct stockcode),'fm999G999G999') as num_product,

to_char(count(distinct customerid),'fm999G999G999') as num_customer,

to_char(count(distinct country),'fm999G999G999') as num_country

from Online_Retail

select

to_char(num_total_invoice ,'fm999G999G999') as num_total_invoice,

to_char(num_sold_invoice,'fm999G999G999') as num_sold_invoice,

round((cast(num_sold_invoice as decimal)/(num_total_invoice))*100, 0) as per_sold_invoice,

to_char(num_canceled_invoice ,'fm999G999G999') as num_canceled_invoice,

round((cast(num_canceled_invoice as decimal)/(num_total_invoice))*100, 0) as per_canceled_invoice,

num_product, num_customer, num_country

from ex_table

2
Step 2:
To count for each customer the number of invoices they did and order them by the count from the max
to min.

---using with clause & analytical function

with c_table (CustomerID,invoiceno, country,quantity , unitprice,rnk_invoice)

as (

select CustomerID,invoiceno ,country, quantity , unitprice ,

dense_rank () over (partition by CustomerID order by invoiceno) as rnk_invoice

from Online_Retail

select distinct CustomerID,

max(rnk_invoice) over (partition by CustomerID ) as count_invoice

from c_table

order by count_invoice desc

---without analytical function

select CustomerID, count ( distinct InvoiceNo) as num_invoice

from Online_Retail

group by CustomerID

order by num_invoice desc

3
Step 3:
To rank the country according to number of customers, income they record and the number of invoices
they produce and select the most ordered product and most profitable product.

with c_table (CustomerID,invoiceno, country,quantity , unitprice,description,cnt_invoice,cnt_customer,


fv, pr_country, pr_product)

as (

select CustomerID,invoiceno ,country, quantity , unitprice , description,

dense_rank () over (partition by country order by invoiceno) as cnt_invoice,

dense_rank () over (partition by country order by CustomerID) as cnt_customer,

first_value(description)over (partition by country order by quantity desc) as fv,

sum (quantity * cast (unitprice as decimal)) over (partition by country) as pr_country,

sum (quantity * cast (unitprice as decimal)) over (partition by country, description) as


pr_product

from Online_Retail

select distinct country,

max(cnt_customer) over (partition by country) as count_customer,

max(cnt_invoice) over (partition by country) as count_invoice,

round (pr_country, 2)as Profit ,

fv as most_ordered_product,

first_value(description)over (partition by country order by pr_product desc) as most_profitable_product

from c_table

order by Profit desc

4
Step 4:
I tried to find out the most profitable month out of the year and its percent rank.

with c_table (year, month , profit ) as

select to_char (TO_TIMESTAMP(invoicedate, 'MM-DD-YYYY HH24:MI:SS'), 'YYYY') as year,

to_char (TO_TIMESTAMP(invoicedate, 'MM-DD-YYYY HH24:MI:SS'), 'Month') as month ,

round (sum (quantity * cast (unitprice as decimal)), 2) as profit

from Online_Retail

group by year, month

select *,

round (cast(percent_rank ()over (partition by year order by profit desc )*100 as decimal) ,2) as
profit_percent_rank

from c_table

order by year, profit desc

5
Step 5:
To get the top 10 returned product

with t_table (description, quantity,price, loss )

as(

select description , sum(quantity)*-1 as quantity , max(unitprice) as price ,

round(sum (quantity* cast (unitprice as decimal)), 2) as loss

from Online_Retail

where invoiceno like 'C%'

group by description

having sum (quantity) <0

select *

from (

select *, rank () over (order by loss) as rnk

from t_table) t

where rnk <11

6
Q1: The Story

Our business is an Online retail business which mean that customers have a plethora of options
in searching for, selecting, and purchasing products, information, and services over the
internet.
The business transactions have been recorded over many years and countries, to get an
overview of how the business is going. I thought of analyzing the dataset using SQL.
1. Firstly, I wanted to explore the ongoing business process by answering the following
questions:
- How many invoices did the business did? and how many of them have really
been sold and how many cancelled? and each percentage out of total?
- How many products does the business sell?
- How many customers does the business serve?
- How many counties does the business cover?
to answer these questions, I wrote the query that in step 1, and I found that the
cancelled invoices represent 2% of the total invoices which is a small percentage which
may indicate that our customers are satisfied by the products quality.

2. So, I thought to know the business repeated customers more, so I run the query that is
in step 2, in this query I ordered the customers according to the number of invoices that
they get. When the number of invoices increases this may indicate that this is a frequent
customer who visits our store regularly. I noticed that the customer with the most
number. of invoices doesn't have and id but I decided to keep it in the data
representation to inform the business about this issue.

3. I wrote the query in step 3 cause, I had been inquiring what is the most profitable
country to our business so we can maintain our spread there, and what's the least
profitable countries, in order to give them more attention and try to schedule more
marketing campaigns. I also wanted to see if the most ordered product does mean that
it’s the most profitable product or not.
in some cases, it was the same in other it wasn't, in cases that is not I recommend that
the business try to turn the most profitable product into most ordered too.

4. Later I was wondering if there is a trend in the salles over the last years so run the query
that is in step 4, and I found that the sales may increase in the q2 and q4.

5. Finally, I thought in taking a step forward and try to know the top 10 returned products
so the business may get red off it, and I calculated how much loss they did to the
business by running the query in step 5.
Note: You can navigate by clicking on the step.

7
Q2: Customer Segmentation

---recency , frequency and monetary

with collection_tabel(customerid, recency,frequency, monetary )

as (

select customerid,

((select max(TO_date(invoicedate,'MM-DD-YYYY' )) from Online_Retail) - max


(TO_date(invoicedate, 'MM-DD-YYYY'))) as recency ,

count(invoiceno) as frequency,

sum(unitprice* quantity) as monetary

from Online_Retail

where customerid != ''

group by customerid

---recency score, frequency and monetary score

calc_rfm (customerid, recency , frequency, monetary,R_S,F_S, M_S ) AS (

select *,

ntile(5) over (order by recency desc) as R_S,

ntile(5) over (order by frequency asc) as F_S,

ntile(5) over (order by monetary asc) as M_S

from collection_tabel

--- joining tables, rounding monetary and frequency and monetary AVG score

t_table(customerid,recency, frequency, monetary , r_score, fm_score) as (

select co.customerid as customerid , co.recency as recency, co.frequency as frequency,

round (cast (co.monetary as decimal ),2) as monetary

8
,ca.R_S as r_score , ((ca.F_S+ ca.M_S)/2) as fm_score

from collection_tabel co,calc_rfm ca

where co.customerid= ca.customerid)

---customer segmentation

select * ,

(CASE WHEN r_score in(5,4) AND fm_score in(5,4)

THEN 'Champions'

WHEN (r_score = 5 AND fm_score= 2) or (r_score = 4 AND fm_score= 2) or

(r_score = 3 AND fm_score= 3) or (r_score = 4 AND fm_score= 3)

THEN 'Potential Loyalists'

WHEN (r_score = 5 AND fm_score= 3) or (r_score = 4 AND fm_score= 4) or

(r_score = 3 AND fm_score= 5) or (r_score = 3 AND fm_score= 4)

THEN 'Loyal Customers'

WHEN (r_score = 5 AND fm_score= 1)

THEN 'Recent Customers'

WHEN r_score in(4,3) AND fm_score in(1)

THEN 'Promising'

WHEN r_score in(3,2) AND fm_score in(3,2)

THEN 'Customers Needing Attention'

WHEN (r_score = 2 AND fm_score= 5) or (r_score = 2 AND fm_score= 4) or

(r_score = 1 AND fm_score= 3)

THEN 'At Risk'

9
WHEN (r_score = 1 AND fm_score= 5) or (r_score = 1 AND fm_score= 4)

THEN 'Cant Lose Them'

WHEN r_score in(1) AND fm_score in(2)

THEN 'Hibernating'

WHEN r_score in(1) AND fm_score in(1)

THEN 'Lost'

END) as cust_segment

from t_table

10

You might also like