Skip to content

Commit 5fe7e21

Browse files
author
way
committed
add SZTcard
1 parent 2ebcbab commit 5fe7e21

File tree

6 files changed

+450
-1
lines changed

6 files changed

+450
-1
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
/.idea/
22
/rent.db
3-
*html
3+
*html
4+
*csv

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@
1717
| [1 亿条淘宝用户行为数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/UserBehaviorFromTaobao_Batch/用户行为数据分析.md) | 清洗 hive + 分析 hive + 可视化 echarts | [阿里云](https://tianchi.aliyun.com/dataset/dataDetail?dataId=649&userId=1) 或者 [百度网盘](https://pan.baidu.com/s/15Ss-nDMA120EHhuwpzYm0g) 提取码:5ipq |
1818
| [1000 万条淘宝用户行为数据实时分析](https://github.com/TurboWay/bigdata_analyse/blob/master/UserBehaviorFromTaobao_Stream/用户行为数据实时分析.md) | 数据源 kafka + 实时分析 flink + 可视化(es + kibana) | [百度网盘](https://pan.baidu.com/s/1wDVQpRV7giIlLJJgRZAInQ) 提取码:gja5 |
1919
| [300 万条《野蛮时代》的玩家数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/AgeOfBarbarians/野蛮时代数据分析.md) | 清洗 pandas + 分析 mysql + 可视化 pyecharts | [百度网盘](https://pan.baidu.com/s/1Mi5lvGDF405Nk8Y2BZDzdQ) 提取码:paq4 |
20+
| [130 万条深圳通刷卡数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/SZTcard/深圳通刷卡数据分析.md) | 清洗 pandas + 分析 impala + 可视化 dbeaver | [百度网盘](https://pan.baidu.com/s/1WslwKXKhVH1q_6u4SvuKkQ) 提取码:t561 |
2021
| [7000 条租房数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/RentFromDanke/租房数据分析.md) | 清洗 pandas + 分析 sqlite + 可视化 matplotlib | [百度网盘](https://pan.baidu.com/s/1l1x5qurJdkyUxAuhknj_Qw) 提取码:9en3 |
2122

2223
## refer
2324

2425
> 1. [https://tianchi.aliyun.com/dataset/](https://tianchi.aliyun.com/dataset/)
26+
> 2. [https://opendata.sz.gov.cn/data/api/toApiDetails/29200_00403601](https://opendata.sz.gov.cn/data/api/toApiDetails/29200_00403601)

SZTcard/analyse.sql

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
-- 乘客主题
2+
3+
-- (整体) 通勤费用
4+
select '整体' deal_type,
5+
count(1) as cnt,
6+
sum(deal_money) / 100 as total,
7+
avg(deal_money) / 100 as per
8+
from sztcard
9+
where deal_type in ('地铁出站', '巴士')
10+
union all
11+
select deal_type,
12+
count(1) as cnt,
13+
sum(deal_money) / 100 as total,
14+
avg(deal_money) / 100 as per
15+
from sztcard
16+
where deal_type in ('地铁出站', '巴士')
17+
group by deal_type;
18+
19+
-- 优惠情况
20+
select case when a.distinct_count = 1 then '全票'
21+
when a.distinct_count = 0.95 then '9.5 折'
22+
when a.distinct_count >= 0.9 then '9 折'
23+
when a.distinct_count >= 0.85 then '8.5 折'
24+
when a.distinct_count >= 0.75 then '7.5 折'
25+
when a.distinct_count >= 0.5 then '半票'
26+
when a.distinct_count = 0 then '免票'
27+
end as distinct_count_range,
28+
sum(cn) as cn
29+
from(
30+
select deal_money / deal_value as distinct_count, count(1) as cn
31+
from sztcard
32+
where deal_value > 0
33+
group by deal_money / deal_value
34+
) as a
35+
group by case when a.distinct_count = 1 then '全票'
36+
when a.distinct_count = 0.95 then '9.5 折'
37+
when a.distinct_count >= 0.9 then '9 折'
38+
when a.distinct_count >= 0.85 then '8.5 折'
39+
when a.distinct_count >= 0.75 then '7.5 折'
40+
when a.distinct_count >= 0.5 then '半票'
41+
when a.distinct_count = 0 then '免票'
42+
end;
43+
44+
-- (整体) 出行时间分布
45+
select hour(deal_date) as h, count(1) as ct
46+
from sztcard
47+
where deal_type in ('地铁入站', '巴士')
48+
group by hour(deal_date)
49+
order by h;
50+
51+
-- (地铁) 通勤时间
52+
with tt as(
53+
select *, row_number() over( partition by card_no order by deal_date) as px
54+
from sztcard
55+
where deal_type rlike '地铁'
56+
),
57+
tt2 as(
58+
select t1.card_no,
59+
t1.deal_type as in_type, t1.company_name as in_company, t1.station as in_station, t1.deal_date as in_date,
60+
t2.deal_type as out_type, t2.company_name as out_company, t2.station as out_station, t2.deal_date as out_date,
61+
unix_timestamp(t2.deal_date) - unix_timestamp(t1.deal_date) as diff_sec
62+
from tt as t1
63+
inner join tt as t2 on t1.card_no = t2.card_no and t1.px = t2.px - 1
64+
where t2.deal_type = '地铁出站'
65+
and t1.deal_type = '地铁入站'
66+
and t1.station <> t2.station
67+
and substring(t1.deal_date, 1, 10) = '2018-09-01'
68+
and substring(t2.deal_date, 1, 10) = '2018-09-01'
69+
)
70+
71+
select avg(diff_sec)/60 from tt2;
72+
73+
74+
-- 地铁主题
75+
76+
-- (基于站点) 进站 top
77+
select station, count(1) as cn
78+
from sztcard
79+
where deal_type = '地铁入站'
80+
and station > ''
81+
group by station
82+
order by cn desc
83+
limit 10;
84+
85+
-- (基于站点) 出站 top
86+
select station, count(1) as cn
87+
from sztcard
88+
where deal_type = '地铁出站'
89+
and station > ''
90+
group by station
91+
order by cn desc
92+
limit 10;
93+
94+
-- (基于站点) 进出站 top
95+
select station, count(1) as cn
96+
from sztcard
97+
where deal_type in ('地铁出站', '地铁入站')
98+
and station > ''
99+
group by station
100+
order by cn desc
101+
limit 10;
102+
103+
-- (基于站点) 站点收入 top
104+
select station, sum(deal_money) / 100 as sm
105+
from sztcard
106+
where deal_type in ('地铁出站', '地铁入站')
107+
and station > ''
108+
group by station
109+
order by sm desc
110+
limit 10;
111+
112+
-- (基于线路) 运输贡献度 top
113+
-- 进站算一次,出站并且联程算一次
114+
select company_name, count(1) as cn
115+
from sztcard
116+
where company_name rlike '地铁'
117+
and (deal_type = '地铁出站' and conn_mark = '1' or deal_type = '地铁入站')
118+
group by company_name
119+
order by cn desc;
120+
121+
-- (基于线路) 运输效率 top
122+
-- 每条线路单程直达乘客耗时平均值排行榜
123+
with tt as(
124+
select *, row_number() over( partition by card_no order by deal_date) as px
125+
from sztcard
126+
where deal_type rlike '地铁'
127+
),
128+
tt2 as(
129+
select t1.card_no,
130+
t1.deal_type as in_type, t1.company_name as in_company, t1.station as in_station, t1.deal_date as in_date,
131+
t2.deal_type as out_type, t2.company_name as out_company, t2.station as out_station, t2.deal_date as out_date,
132+
unix_timestamp(t2.deal_date) - unix_timestamp(t1.deal_date) as diff_sec
133+
from tt as t1
134+
inner join tt as t2 on t1.card_no = t2.card_no and t1.px = t2.px - 1
135+
where t2.deal_type = '地铁出站'
136+
and t1.deal_type = '地铁入站'
137+
and t1.station <> t2.station
138+
and substring(t1.deal_date, 1, 10) = '2018-09-01'
139+
and substring(t2.deal_date, 1, 10) = '2018-09-01'
140+
)
141+
142+
select in_company, avg(diff_sec) / 60 avg_min
143+
from tt2
144+
where in_company = out_company
145+
group by in_company
146+
order by avg_min;
147+
148+
-- (基于线路) 换乘比例 top
149+
-- 每线路换乘出站乘客百分比排行榜
150+
with tt as(
151+
select *, row_number() over( partition by card_no order by deal_date) as px
152+
from sztcard
153+
where deal_type rlike '地铁'
154+
),
155+
tt2 as(
156+
select t1.card_no,
157+
t1.deal_type as in_type, t1.company_name as in_company, t1.station as in_station, t1.deal_date as in_date,
158+
t2.deal_type as out_type, t2.company_name as out_company, t2.station as out_station, t2.deal_date as out_date,
159+
t2.conn_mark,
160+
unix_timestamp(t2.deal_date) - unix_timestamp(t1.deal_date) as diff_sec
161+
from tt as t1
162+
inner join tt as t2 on t1.card_no = t2.card_no and t1.px = t2.px - 1
163+
where t2.deal_type = '地铁出站'
164+
and t1.deal_type = '地铁入站'
165+
and t1.station <> t2.station
166+
and substring(t1.deal_date, 1, 10) = '2018-09-01'
167+
and substring(t2.deal_date, 1, 10) = '2018-09-01'
168+
)
169+
170+
select out_company, sum(case when conn_mark = '1' then 1 else 0 end) / count(1) as per
171+
from tt2
172+
group by out_company
173+
order by per desc;
174+
175+
-- (基于线路) 线路收入 top
176+
select company_name, sum(deal_money) / 100 as sm
177+
from sztcard
178+
where deal_type rlike '地铁'
179+
group by company_name
180+
order by sm desc;
181+
182+
-- 巴士主题
183+
184+
-- (基于公司) 巴士公司收入 top
185+
select company_name, sum(deal_money) / 100 as sm
186+
from sztcard
187+
where deal_type not rlike '地铁'
188+
group by company_name
189+
order by sm desc;
190+
191+
-- (基于公司) 巴士公司贡献度 top
192+
select company_name, count(1) as cn
193+
from sztcard
194+
where deal_type not rlike '地铁'
195+
group by company_name
196+
order by cn desc;

SZTcard/etl.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
# @Time : 2021/1/8 20:03
4+
# @Author : way
5+
# @Site :
6+
# @Describe: 数据处理 https://opendata.sz.gov.cn/data/dataSet/toDataDetails/29200_00403601
7+
8+
import json
9+
import pandas as pd
10+
11+
############################################# 解析 json 数据文件 ##########################################################
12+
path = r"C:\Users\Administrator\Desktop\2018record3.jsons"
13+
data = []
14+
with open(path, 'r', encoding='utf-8') as f:
15+
for line in f.readlines():
16+
data += json.loads(line)['data']
17+
data = pd.DataFrame(data)
18+
columns = ['card_no', 'deal_date', 'deal_type', 'deal_money', 'deal_value', 'equ_no', 'company_name', 'station', 'car_no', 'conn_mark', 'close_date']
19+
data = data[columns] # 调整字段顺序
20+
data.info()
21+
22+
############################################# 输出处理 ##########################################################
23+
# 全部都是 交通运输 的刷卡数据
24+
print(data['company_name'].unique())
25+
26+
# 删除重复值
27+
# print(data[data.duplicated()])
28+
data.drop_duplicates(inplace=True)
29+
data.reset_index(drop=True, inplace=True)
30+
31+
# 缺失值
32+
# 只有线路站点和车牌号两个字段存在为空,不做处理
33+
# print(data.isnull().sum())
34+
35+
# 去掉脏数据
36+
data = data[data['deal_date'] > '2018-08-31']
37+
############################################# 数据保存 ##########################################################
38+
print(data.info)
39+
40+
# 数据保存为 csv
41+
data.to_csv('SZTcard.csv', index=False, header=None)

SZTcard/table.sql

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
-- 建表
2+
CREATE TABLE `sztcard`(
3+
`card_no` string COMMENT '卡号',
4+
`deal_date` string COMMENT '交易日期时间',
5+
`deal_type` string COMMENT '交易类型',
6+
`deal_money` float COMMENT '交易金额',
7+
`deal_value` float COMMENT '交易值',
8+
`equ_no` string COMMENT '设备编码',
9+
`company_name` string COMMENT '公司名称',
10+
`station` string COMMENT '线路站点',
11+
`car_no` string COMMENT '车牌号',
12+
`conn_mark` string COMMENT '联程标记',
13+
`close_date` string COMMENT '结算日期'
14+
)
15+
row format delimited
16+
fields terminated by ','
17+
lines terminated by '\n';
18+
19+
-- 加载数据
20+
LOAD DATA INPATH '/tmp/SZTcard.csv' OVERWRITE INTO TABLE sztcard;

0 commit comments

Comments
 (0)