Skip to content

Commit 71cffbc

Browse files
authored
Add files via upload
1 parent e18af71 commit 71cffbc

File tree

4 files changed

+305
-0
lines changed

4 files changed

+305
-0
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import os
2+
3+
import numpy as np
4+
import pandas as pd
5+
6+
# Seed NumPy's global RNG so the random end dates drawn later are repeatable.
np.random.seed(2021)

# Raw inputs: the app launch log and the test file.
launch = pd.read_csv('raw_data/wsdm_train_data/app_launch_logs.csv')
test = pd.read_csv('raw_data/test-a.csv')

# Collapse the log to one row per user; `launch_date` and `launch_type`
# become parallel lists holding every logged launch of that user.
# (The former bare `launch.date.min(), launch.date.max()` expression was a
# notebook display leftover with no effect in a script and was removed.)
launch_grp = (
    launch.groupby('user_id')
    .agg(launch_date=('date', list), launch_type=('launch_type', list))
    .reset_index()
)
16+
17+
18+
def choose_end_date(launch_date):
    """Draw a random end date for one user from NumPy's global RNG.

    Args:
        launch_date: Non-empty collection of the user's launch days (ints).

    Returns:
        An integer drawn uniformly from ``[first, last - 7)`` when the user's
        launch span allows it, otherwise from the fixed fallback range
        ``[100, 222 - 7)``.
    """
    first_day = min(launch_date)
    upper_bound = max(launch_date) - 7

    if first_day < upper_bound:
        low, high = first_day, upper_bound
    else:
        # Span too short to leave 7 days after the end date: use fixed range.
        low, high = 100, 222 - 7

    return np.random.randint(low, high)
25+
26+
27+
def get_label(row):
    """Count the distinct launch days in the 7 days after ``row.end_date``.

    Args:
        row: Object exposing ``launch_date`` (iterable of days) and
            ``end_date`` (the window start, exclusive).

    Returns:
        Number of unique days ``d`` with ``end_date < d < end_date + 8``.
    """
    end = row.end_date
    # A set comprehension de-duplicates days launched more than once.
    days_in_window = {day for day in row.launch_date if end < day < end + 8}
    return len(days_in_window)
32+
33+
34+
# Give every user a randomly drawn end date, then label the row with the
# number of distinct launch days in the 7 days after that date.
launch_grp['end_date'] = launch_grp.launch_date.apply(choose_end_date)
launch_grp['label'] = launch_grp.apply(get_label, axis=1)

train = launch_grp[['user_id', 'end_date', 'label']]

# label == -1 marks rows that come from the test file.
test['label'] = -1

# Stack train and test rows; the bare `train` / `test` / `data` display
# expressions left over from a notebook were removed (no effect in a script).
data = pd.concat([train, test], ignore_index=True)

# Left merge keeps every row and attaches the user's launch history lists.
data = data.merge(
    launch_grp[['user_id', 'launch_type', 'launch_date']],
    how='left',
    on='user_id',
)
50+
51+
52+
# get latest 32 days([end_date-31, end_date]) launch type sequence
53+
# 0 for not launch, 1 for launch_type=0, and 2 for launch_type=1
54+
def gen_launch_seq(row):
    """Build the 32-day launch-type sequence ending at ``row.end_date``.

    Args:
        row: Object exposing ``launch_type`` and ``launch_date`` (parallel
            iterables) and an integer ``end_date``.

    Returns:
        A list of 32 ints covering days ``end_date - 31 .. end_date``:
        ``0`` where no launch is recorded, otherwise ``launch_type + 1``.
        When a day appears more than once, the last recorded entry wins.
    """
    try:
        pairs = zip(row.launch_type, row.launch_date)
    except TypeError:
        # After a left merge, a user without any launch log carries NaN
        # instead of lists; treat that as "never launched".
        pairs = ()

    # No pre-sort needed: a later duplicate key simply overwrites an earlier
    # one, which is what the stable sort by date produced as well.
    seq_map = {day: kind + 1 for kind, day in pairs}

    end = row.end_date
    return [seq_map.get(day, 0) for day in range(end - 31, end + 1)]
61+
62+
63+
# Turn each row's launch history into a fixed-length sequence.
data['launch_seq'] = data.apply(gen_launch_seq, axis=1)

# The raw history lists are no longer needed once the sequence exists.
# (The bare `data` / `data.head()` notebook display expressions were removed;
# they have no effect when run as a script.)
data.drop(columns=['launch_date', 'launch_type'], inplace=True)

os.makedirs('data', exist_ok=True)
data.to_pickle('data/all_data.pkl')
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import torch
2+
import torch.nn as nn
3+
4+
5+
class AQYModel(nn.Module):
    """Predict one value per sample from a user id and a launch sequence.

    The user id goes through an embedding table, the launch sequence through
    a single-feature GRU whose outputs are averaged over time, and the two
    vectors are concatenated and fed to one linear layer.

    Args:
        num_users: Largest user index to support; the embedding table has
            ``num_users + 1`` rows.
        emb_dim: Width of the user embedding.
        hidden_size: Hidden size of the GRU.
        seq_len: Number of time steps in each launch sequence.

    The defaults reproduce the previously hard-coded sizes, and the submodule
    names are unchanged, so ``AQYModel()`` behaves as before.
    """

    def __init__(self, num_users=600000, emb_dim=16, hidden_size=16,
                 seq_len=32):
        super().__init__()

        self.seq_len = seq_len

        self.user_id_embedding = nn.Embedding(num_users + 1, emb_dim)
        self.launch_seq_gru = nn.GRU(input_size=1,
                                     hidden_size=hidden_size,
                                     batch_first=True)

        self.fc = nn.Linear(emb_dim + hidden_size, 1)

    def forward(self, user_id, launch_seq):
        """Return predictions of shape ``(batch, 1)``.

        Args:
            user_id: Integer tensor of user indices, one per sample.
            launch_seq: Float tensor holding ``seq_len`` values per sample.
        """
        user_id_emb = self.user_id_embedding(user_id)

        # The GRU expects (batch, time, features) with a single feature.
        launch_seq = launch_seq.reshape((-1, self.seq_len, 1))
        launch_seq, _ = self.launch_seq_gru(launch_seq)
        # Average the per-step GRU outputs into one vector per sample.
        launch_seq = torch.mean(launch_seq, dim=1)

        fc_input = torch.cat([user_id_emb, launch_seq], 1)

        return self.fc(fc_input)
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import numpy as np
2+
import torch
3+
from torch.utils.data import Dataset
4+
from tqdm import tqdm
5+
6+
7+
def cal_score(pred, label):
    """Return ``100 * (1 - mean(|pred - label| / 7))``.

    Args:
        pred: Array-like of predicted values.
        label: Array-like of true values, same length as ``pred``.

    Returns:
        The score as a NumPy float; 100 means a perfect match.
    """
    scaled_error = np.abs((np.array(pred) - np.array(label)) / 7)
    return 100 * (1 - np.mean(scaled_error))
16+
17+
18+
class AQYDataset(Dataset):
    """Dataset view over the ``user_id``, ``launch_seq`` and ``label`` columns.

    Each item is the tuple ``(user_id, launch_seq, label)`` for one row, with
    ``launch_seq`` converted to a NumPy array.
    """

    def __init__(self, df, device):
        # `device` is accepted for the caller's convenience but is not used.
        (self.user_id_list,
         self.launch_seq_list,
         self.label_list) = (df[column].values
                             for column in ('user_id', 'launch_seq', 'label'))

    def __len__(self):
        return len(self.user_id_list)

    def __getitem__(self, index):
        sequence = np.array(self.launch_seq_list[index])
        return self.user_id_list[index], sequence, self.label_list[index]
37+
38+
39+
def fit(model, train_loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Module called as ``model(user_id, launch_seq)``; expected to
            produce one value per sample.
        train_loader: Iterable of ``(user_id, launch_seq, label)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(pred, label)``.
        device: Device the batches are moved to.

    Returns:
        ``cal_score`` of all predictions made during the pass against their
        labels.
    """
    model.train()

    pred_list = []
    label_list = []

    for user_id, launch_seq, label in tqdm(train_loader):
        user_id = user_id.long().to(device)
        launch_seq = launch_seq.float().to(device)
        # as_tensor accepts an existing tensor without the copy-construct
        # warning that torch.tensor(tensor) raises.
        label = torch.as_tensor(label).float().to(device).reshape(-1)

        # Start every step from clean gradients.
        model.zero_grad()

        # reshape(-1) keeps a 1-D result even for a batch of one sample;
        # squeeze() would yield a 0-d tensor that cannot be iterated below.
        pred = model(user_id, launch_seq).reshape(-1)

        loss = criterion(pred, label)
        loss.backward()
        optimizer.step()

        pred_list.extend(pred.detach().cpu().numpy())
        label_list.extend(label.detach().cpu().numpy())

    return cal_score(pred_list, label_list)
63+
64+
65+
def validate(model, val_loader, device):
    """Score ``model`` on ``val_loader`` without updating it.

    Args:
        model: Module called as ``model(user_id, launch_seq)``; expected to
            produce one value per sample.
        val_loader: Iterable of ``(user_id, launch_seq, label)`` batches.
        device: Device the batches are moved to.

    Returns:
        ``cal_score`` of the predictions against the labels.
    """
    model.eval()

    pred_list = []
    label_list = []

    # Evaluation needs no gradients; skipping autograd bookkeeping saves
    # memory and time.
    with torch.no_grad():
        for user_id, launch_seq, label in tqdm(val_loader):
            user_id = user_id.long().to(device)
            launch_seq = launch_seq.float().to(device)
            label = torch.as_tensor(label).float().to(device).reshape(-1)

            # reshape(-1) stays 1-D for a batch of one sample, where
            # squeeze() would produce a non-iterable 0-d tensor.
            pred = model(user_id, launch_seq).reshape(-1)

            pred_list.extend(pred.cpu().numpy())
            label_list.extend(label.cpu().numpy())

    return cal_score(pred_list, label_list)
84+
85+
86+
def predict(model, test_loader, device):
    """Return the model's predictions for every sample in ``test_loader``.

    Args:
        model: Module called as ``model(user_id, launch_seq)``; expected to
            produce one value per sample.
        test_loader: Iterable of ``(user_id, launch_seq, label)`` batches;
            the label is ignored.
        device: Device the batches are moved to.

    Returns:
        A flat list of predicted values in loader order.
    """
    model.eval()
    test_pred = []

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for user_id, launch_seq, _ in tqdm(test_loader):
            user_id = user_id.long().to(device)
            launch_seq = launch_seq.float().to(device)

            # reshape(-1) stays 1-D for a batch of one sample, where
            # squeeze() would produce a non-iterable 0-d tensor.
            pred = model(user_id, launch_seq).reshape(-1)
            test_pred.extend(pred.cpu().numpy())

    return test_pred
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import copy
2+
import os
3+
import warnings
4+
5+
import numpy as np
6+
import pandas as pd
7+
import torch
8+
import torch.nn as nn
9+
from sklearn.preprocessing import LabelEncoder
10+
from torch.utils.data import DataLoader
11+
12+
from model import AQYModel
13+
from model_tools import AQYDataset, fit, predict, validate
14+
15+
warnings.filterwarnings('ignore')
16+
17+
18+
def random_seed(seed):
    """Seed NumPy and PyTorch (CPU and all CUDA devices) with ``seed``.

    Also switches cuDNN to deterministic mode and turns its benchmark
    autotuner off.
    """
    for seeder in (np.random.seed,
                   torch.manual_seed,
                   torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
25+
26+
27+
random_seed(2021)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data = pd.read_pickle('data/all_data.pkl')

# Encode user ids as consecutive integers, shifted by one so they start at 1.
user_lbe = LabelEncoder()
data['user_id'] = user_lbe.fit_transform(data['user_id'])
data['user_id'] = data['user_id'] + 1

# Rows labelled -1 are the test rows; everything else is training data.
# `test` and `valid` receive new columns later, so take explicit copies
# instead of assigning into slices of another frame.
train = data[data['label'] != -1]
test = data[data['label'] == -1].copy()

# Shuffle, then hold out the last 10% of the shuffled rows for validation.
train = train.sample(frac=1, random_state=2021).reset_index(drop=True)

train_shape = int(train.shape[0] * 0.9)

valid = train.iloc[train_shape:].copy()
train = train.iloc[:train_shape]

print(train.shape, valid.shape, test.shape)

train_dataset = AQYDataset(train, device)
valid_dataset = AQYDataset(valid, device)
test_dataset = AQYDataset(test, device)
54+
55+
# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=128, num_workers=4)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

model = AQYModel().to(device)

# Mean-squared-error regression trained with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
72+
73+
best_val_score = float('-inf')
last_improve = 0
best_model = None

# Train for at most 10 epochs, keeping a copy of the best-scoring model and
# stopping once more than 3 epochs pass without a validation improvement.
for epoch in range(10):
    train_score = fit(model, train_loader, optimizer, criterion, device)
    val_score = validate(model, valid_loader, device)

    improved = val_score > best_val_score
    if improved:
        best_val_score = val_score
        # Deep copy so later epochs cannot overwrite the best weights.
        best_model = copy.deepcopy(model)
        last_improve = epoch
    improve = '*' if improved else ''

    # Log before the early-stop check so the final epoch is reported too.
    print(
        f'Epoch: {epoch} Train Score: {train_score}, Valid Score: {val_score} {improve}'
    )

    if epoch - last_improve > 3:
        break
95+
96+
# Continue with the best checkpoint found during training.
model = best_model

# Recompute the validation score from the per-row scaled absolute error.
valid['pred'] = predict(model, valid_loader, device)
valid['diff'] = abs(valid['label'] - valid['pred']) / 7
score = 100 * (1 - valid['diff'].mean())
print(f'Valid Score: {score}')

os.makedirs('sub', exist_ok=True)

test['pred'] = predict(model, test_loader, device)
test = test[['user_id', 'pred']]
# Undo the +1 shift and the label encoding to recover the original user ids.
test['user_id'] = user_lbe.inverse_transform(test['user_id'] - 1)

# The submission file is named after the validation score.
test.to_csv(f'sub/{score}.csv', index=False, header=False, float_format="%.2f")

0 commit comments

Comments
 (0)