add SZTcard

way · way · commit 5fe7e21fc693 · 2021-01-11T17:38:03.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 /.idea/
 /rent.db
-*html
+*html
+*csv
diff --git a/README.md b/README.md
@@ -17,8 +17,10 @@
 | [1 亿条淘宝用户行为数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/UserBehaviorFromTaobao_Batch/用户行为数据分析.md)       | 清洗 hive  + 分析 hive + 可视化 echarts | [阿里云](https://tianchi.aliyun.com/dataset/dataDetail?dataId=649&userId=1) 或者 [百度网盘](https://pan.baidu.com/s/15Ss-nDMA120EHhuwpzYm0g) 提取码：5ipq |
 | [1000 万条淘宝用户行为数据实时分析](https://github.com/TurboWay/bigdata_analyse/blob/master/UserBehaviorFromTaobao_Stream/用户行为数据实时分析.md)       | 数据源 kafka  + 实时分析 flink + 可视化（es + kibana）  | [百度网盘](https://pan.baidu.com/s/1wDVQpRV7giIlLJJgRZAInQ)  提取码：gja5  |
 | [300 万条《野蛮时代》的玩家数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/AgeOfBarbarians/野蛮时代数据分析.md)       | 清洗 pandas  + 分析 mysql + 可视化 pyecharts | [百度网盘](https://pan.baidu.com/s/1Mi5lvGDF405Nk8Y2BZDzdQ) 提取码：paq4 |
+| [130 万条深圳通刷卡数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/SZTcard/深圳通刷卡数据分析.md)       | 清洗 pandas  + 分析 impala + 可视化 dbeaver | [百度网盘](https://pan.baidu.com/s/1WslwKXKhVH1q_6u4SvuKkQ) 提取码：t561 |
 | [7000 条租房数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/RentFromDanke/租房数据分析.md)       | 清洗 pandas  + 分析 sqlite + 可视化 matplotlib  | [百度网盘](https://pan.baidu.com/s/1l1x5qurJdkyUxAuhknj_Qw) 提取码：9en3 |
 
 ## refer
 
 > 1. [https://tianchi.aliyun.com/dataset/](https://tianchi.aliyun.com/dataset/)
+> 2. [https://opendata.sz.gov.cn/data/api/toApiDetails/29200_00403601](https://opendata.sz.gov.cn/data/api/toApiDetails/29200_00403601)
diff --git a/SZTcard/analyse.sql b/SZTcard/analyse.sql
@@ -0,0 +1,196 @@
+-- 乘客主题
+
+-- Commuting cost: one overall row ('整体'), then one row per deal type (metro exit, bus)
+-- deal_money is divided by 100 for display (presumably cents -> yuan; confirm against the source data)
+select  '整体' as deal_type,
+        count(*) as cnt,
+        sum(deal_money) / 100 as total,
+        avg(deal_money) / 100 as per
+from sztcard
+where deal_type = '地铁出站' or deal_type = '巴士'
+union all
+select  deal_type,
+        count(*) as cnt,
+        sum(deal_money) / 100 as total,
+        avg(deal_money) / 100 as per
+from sztcard
+where deal_type = '地铁出站' or deal_type = '巴士'
+group by deal_type;
+
+-- Discount distribution: bucket the ratio deal_money / deal_value and count swipes per bucket
+with ratio_cnt as (
+    -- only rows with deal_value > 0 are kept, so the division never divides by zero
+    select deal_money / deal_value as distinct_count, count(1) as cn
+    from sztcard
+    where deal_value > 0
+    group by deal_money / deal_value
+),
+labelled as (
+    -- branches are tested top-down: any ratio >= 0.9 other than exactly 1 or 0.95 lands in '9 折';
+    -- ratios strictly between 0 and 0.5 (and negative ones) match no branch and get a NULL label
+    select case when distinct_count = 1 then '全票'
+                when distinct_count = 0.95 then '9.5 折'
+                when distinct_count >= 0.9 then '9 折'
+                when distinct_count >= 0.85 then '8.5 折'
+                when distinct_count >= 0.75 then '7.5 折'
+                when distinct_count >= 0.5 then '半票'
+                when distinct_count = 0 then '免票'
+                end as distinct_count_range,
+           cn
+    from ratio_cnt
+)
+select distinct_count_range, sum(cn) as cn
+from labelled
+group by distinct_count_range;
+
+-- (Overall) travel-time distribution: number of metro-entry and bus swipes per hour of day
+select h, count(*) as ct
+from (
+    select hour(deal_date) as h
+    from sztcard
+    where deal_type in ('地铁入站', '巴士')
+) as hourly
+group by h
+order by h;
+
+-- (Metro) average commute time in minutes between an entry swipe and the same card's next metro swipe
+with tt as (
+    -- lead() reads the card's following metro swipe in the same pass, so no self-join is needed
+    -- and each swipe is paired under a single, consistent ordering
+    select card_no, deal_type as in_type, station as in_station, deal_date as in_date,
+           lead(deal_type) over (partition by card_no order by deal_date) as out_type,
+           lead(station)   over (partition by card_no order by deal_date) as out_station,
+           lead(deal_date) over (partition by card_no order by deal_date) as out_date
+    from sztcard
+    where deal_type rlike '地铁'
+),
+tt2 as (
+    -- keep entry -> exit pairs at different stations with both swipes dated 2018-09-01
+    select card_no, in_station, in_date, out_station, out_date,
+           unix_timestamp(out_date) - unix_timestamp(in_date) as diff_sec
+    from tt
+    where in_type = '地铁入站'
+    and out_type = '地铁出站'
+    and in_station <> out_station
+    and substring(in_date, 1, 10) = '2018-09-01'
+    and substring(out_date, 1, 10) = '2018-09-01'
+)
+
+select avg(diff_sec)/60 from tt2;
+
+
+-- 地铁主题
+
+-- (By station) top 10 stations by number of metro entry swipes
+select
+    station,
+    count(*) as cn
+from sztcard
+where deal_type = '地铁入站' and station > ''   -- station > '' drops NULL and empty station names
+group by station
+order by cn desc
+limit 10;
+
+-- (By station) top 10 stations by number of metro exit swipes
+select
+    station,
+    count(*) as cn
+from sztcard
+where deal_type = '地铁出站' and station > ''   -- station > '' drops NULL and empty station names
+group by station
+order by cn desc
+limit 10;
+
+-- (By station) top 10 stations by metro entry and exit swipes combined
+select
+    station,
+    count(*) as cn
+from sztcard
+where (deal_type = '地铁出站' or deal_type = '地铁入站')
+and station > ''   -- drops NULL and empty station names
+group by station
+order by cn desc
+limit 10;
+
+-- (By station) top 10 stations by revenue: sum of deal_money / 100 over metro entry and exit swipes
+select
+    station,
+    sum(deal_money) / 100 as sm
+from sztcard
+where (deal_type = '地铁出站' or deal_type = '地铁入站')
+and station > ''   -- drops NULL and empty station names
+group by station
+order by sm desc
+limit 10;
+
+-- (By line) transport contribution ranking
+-- every entry counts once; an exit counts once only when it is flagged as a connecting trip (conn_mark = '1')
+select
+    company_name,
+    count(*) as cn
+from sztcard
+where company_name rlike '地铁'
+and (deal_type = '地铁入站' or (deal_type = '地铁出站' and conn_mark = '1'))
+group by company_name
+order by cn desc;
+
+-- (By line) transport efficiency ranking
+-- average minutes per trip for riders whose entry and exit are on the same line, fastest line first
+with tt as (
+    -- lead() reads the card's following metro swipe in the same pass, so no self-join is needed
+    -- and each swipe is paired under a single, consistent ordering
+    select card_no,
+           deal_type as in_type, company_name as in_company, station as in_station, deal_date as in_date,
+           lead(deal_type)    over (partition by card_no order by deal_date) as out_type,
+           lead(company_name) over (partition by card_no order by deal_date) as out_company,
+           lead(station)      over (partition by card_no order by deal_date) as out_station,
+           lead(deal_date)    over (partition by card_no order by deal_date) as out_date
+    from sztcard
+    where deal_type rlike '地铁'
+),
+tt2 as (
+    -- keep entry -> exit pairs at different stations with both swipes dated 2018-09-01
+    select in_company, out_company,
+           unix_timestamp(out_date) - unix_timestamp(in_date) as diff_sec
+    from tt
+    where in_type = '地铁入站'
+    and out_type = '地铁出站'
+    and in_station <> out_station
+    and substring(in_date, 1, 10) = '2018-09-01'
+    and substring(out_date, 1, 10) = '2018-09-01'
+)
+
+select in_company, avg(diff_sec) / 60 as avg_min
+from tt2
+where in_company = out_company
+group by in_company
+order by avg_min;
+
+-- (By line) transfer share ranking
+-- per exit line: share of paired trips whose exit swipe is flagged as a connecting trip (conn_mark = '1')
+with tt as (
+    -- lead() reads the card's following metro swipe in the same pass, so no self-join is needed
+    -- and each swipe is paired under a single, consistent ordering
+    select card_no,
+           deal_type as in_type, station as in_station, deal_date as in_date,
+           lead(deal_type)    over (partition by card_no order by deal_date) as out_type,
+           lead(company_name) over (partition by card_no order by deal_date) as out_company,
+           lead(station)      over (partition by card_no order by deal_date) as out_station,
+           lead(deal_date)    over (partition by card_no order by deal_date) as out_date,
+           lead(conn_mark)    over (partition by card_no order by deal_date) as out_conn_mark
+    from sztcard
+    where deal_type rlike '地铁'
+),
+tt2 as (
+    -- keep entry -> exit pairs at different stations with both swipes dated 2018-09-01
+    select out_company, out_conn_mark as conn_mark
+    from tt
+    where in_type = '地铁入站'
+    and out_type = '地铁出站'
+    and in_station <> out_station
+    and substring(in_date, 1, 10) = '2018-09-01'
+    and substring(out_date, 1, 10) = '2018-09-01'
+)
+
+select out_company, sum(case when conn_mark = '1' then 1 else 0 end) / count(1) as per
+from tt2
+group by out_company
+order by per desc;
+
+-- (By line) revenue ranking: sum of deal_money / 100 per company_name over all metro swipes
+select
+    company_name,
+    sum(deal_money) / 100 as sm
+from sztcard
+where deal_type rlike '地铁'
+group by company_name
+order by sm desc;
+
+-- 巴士主题
+
+-- (By company) bus company revenue ranking: every swipe whose deal_type does not contain '地铁'
+select
+    company_name,
+    sum(deal_money) / 100 as sm
+from sztcard
+where not (deal_type rlike '地铁')
+group by company_name
+order by sm desc;
+
+-- (By company) bus company contribution ranking: swipe count for every deal_type that does not contain '地铁'
+select
+    company_name,
+    count(*) as cn
+from sztcard
+where not (deal_type rlike '地铁')
+group by company_name
+order by cn desc;
diff --git a/SZTcard/etl.py b/SZTcard/etl.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# @Time : 2021/1/8 20:03
+# @Author : way
+# @Site : 
+# @Describe: 数据处理 https://opendata.sz.gov.cn/data/dataSet/toDataDetails/29200_00403601
+
+import json
+import pandas as pd
+
+############################################# 解析 json 数据文件 ##########################################################
+path = r"C:\Users\Administrator\Desktop\2018record3.jsons"
+data = []
+with open(path, 'r', encoding='utf-8') as f:
+    for line in f.readlines():
+        data += json.loads(line)['data']
+data = pd.DataFrame(data)
+columns = ['card_no', 'deal_date', 'deal_type', 'deal_money', 'deal_value', 'equ_no', 'company_name', 'station', 'car_no', 'conn_mark', 'close_date']
+data = data[columns]  # 调整字段顺序
+data.info()
+
+############################################# 输出处理 ##########################################################
+# 全部都是 交通运输 的刷卡数据
+print(data['company_name'].unique())
+
+# 删除重复值
+# print(data[data.duplicated()])
+data.drop_duplicates(inplace=True)
+data.reset_index(drop=True, inplace=True)
+
+# 缺失值
+# 只有线路站点和车牌号两个字段存在为空，不做处理
+# print(data.isnull().sum())
+
+# 去掉脏数据
+data = data[data['deal_date'] > '2018-08-31']
+############################################# 数据保存 ##########################################################
+print(data.info)
+
+# 数据保存为 csv
+data.to_csv('SZTcard.csv', index=False, header=None)
diff --git a/SZTcard/table.sql b/SZTcard/table.sql
@@ -0,0 +1,20 @@
+-- Create the swipe-record table: comma-delimited text, one record per line,
+-- columns in the same order as the CSV written by etl.py
+create table sztcard (
+    card_no      string comment '卡号',
+    deal_date    string comment '交易日期时间',
+    deal_type    string comment '交易类型',
+    deal_money   float  comment '交易金额',
+    deal_value   float  comment '交易值',
+    equ_no       string comment '设备编码',
+    company_name string comment '公司名称',
+    station      string comment '线路站点',
+    car_no       string comment '车牌号',
+    conn_mark    string comment '联程标记',
+    close_date   string comment '结算日期'
+)
+row format delimited
+fields terminated by ','
+lines terminated by '\n';
+
+-- Load the CSV into the table; OVERWRITE replaces whatever the table currently holds
+load data inpath '/tmp/SZTcard.csv'
+overwrite into table sztcard;
diff --git a/SZTcard/深圳通刷卡数据分析.md b/SZTcard/深圳通刷卡数据分析.md

-Original file line number
+Diff line change
@@ @@ -1,3 +1,4 @@ @@
 /.idea/
 /rent.db
 -*html
 +*html
 +*csv