github:https://github.com/huxiaoman7/PaddleAI
Paddle models:https://github.com/PaddlePaddle/models
欢迎大家star、fork、提issue和贡献新案例~
数据准备
cd data && ./download.sh && cd ..
.── CriteoDataset
│
├── train
│
├── test
│
├── infer
class CriteoDataset(Dataset):
    """Builds sample readers over tab-separated Criteo click-log files.

    Each input line is split on tabs: column 0 is the label, columns 1-13
    are continuous features and columns 14-39 are categorical features.
    """

    def __init__(self, sparse_feature_dim):
        """Store the normalisation tables and the hashing modulus.

        Args:
            sparse_feature_dim: modulus applied to the hash of every
                categorical feature (i.e. the number of hash buckets).
        """
        # Per-column offset subtracted from each continuous feature.
        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        # Not read anywhere in this class; presumably the per-column upper
        # bound that cont_diff_ was derived from -- TODO confirm.
        self.cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
        # Per-column divisor; element-wise equal to cont_max_ - cont_min_.
        self.cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
        self.hash_dim_ = sparse_feature_dim
        # Lines numbered 1..train_idx_ (inclusive) are the training split;
        # every later line belongs to the evaluation split.
        self.train_idx_ = 41256555
        self.continuous_range_ = range(1, 14)
        self.categorical_range_ = range(14, 40)

    def _reader_creator(self, file_list, is_train, trainer_num, trainer_id):
        """Return a generator function that yields one sample per kept line.

        Args:
            file_list: iterable of paths to tab-separated data files.
            is_train: when true, only lines up to ``train_idx_`` are read;
                otherwise only the lines after it.
            trainer_num: number of trainers sharing the data.
            trainer_id: a line is kept only when
                ``line_number % trainer_num == trainer_id``.

        Returns:
            A zero-argument callable. Each yielded sample is a list of
            ``[dense_features]``, followed by one single-element list per
            categorical column, followed by ``[label]``.
        """
        def reader():
            for path in file_list:
                with open(path, 'r') as handle:
                    # Line numbers are 1-based and restart for every file.
                    for line_no, raw in enumerate(handle, 1):
                        if is_train:
                            if line_no > self.train_idx_:
                                # Past the training split: done with this file.
                                break
                        elif line_no <= self.train_idx_:
                            # Still inside the training split: skip it.
                            continue
                        if line_no % trainer_num != trainer_id:
                            # This line is assigned to a different trainer.
                            continue

                        fields = raw.rstrip('\n').split('\t')
                        # Empty continuous fields become 0.0; the rest are
                        # shifted by cont_min_ and scaled by cont_diff_.
                        dense = [
                            0.0 if fields[col] == ''
                            else (float(fields[col]) - self.cont_min_[col - 1]) / self.cont_diff_[col - 1]
                            for col in self.continuous_range_
                        ]
                        # The column index is prepended so equal raw values
                        # in different columns hash to different buckets.
                        # NOTE(review): built-in hash() of a str is salted
                        # per process on Python 3 unless PYTHONHASHSEED is
                        # fixed -- confirm the runtime before relying on
                        # stable bucket ids across processes.
                        sparse = [
                            [hash(str(col) + fields[col]) % self.hash_dim_]
                            for col in self.categorical_range_
                        ]
                        click = [int(fields[0])]
                        yield [dense] + sparse + [click]

        return reader
python train.py \
--train_data_path data/raw/train.txt \
2>&1 | tee train.log
运行方式
sh cluster_train.sh
调用接口
# CPU-only parallel executor for the training program.
pe = fluid.ParallelExecutor(
    use_cuda=False,
    loss_name=loss.name,
    main_program=train_program,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)

# Distributed mode: transpile the program, then act according to the role
# given on the command line.
logger.info("run dist training")
t = fluid.DistributeTranspiler()
t.transpile(args.trainer_id, pservers=args.endpoints, trainers=args.trainers)

if args.role in ("pserver", "PSERVER"):
    # Parameter server: run its startup program, then its main program.
    logger.info("run pserver")
    prog = t.get_pserver_program(args.current_endpoint)
    startup = t.get_startup_program(args.current_endpoint, pserver_program=prog)
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup)
    exe.run(prog)
elif args.role in ("trainer", "TRAINER"):
    # Trainer: fetch the transpiled trainer program and enter the loop.
    logger.info("run trainer")
    train_prog = t.get_trainer_program()
    train_loop(args, train_prog, py_reader, loss, auc_var, batch_auc_var,
               args.trainers, args.trainer_id)
注:batch_size由默认的1000修改为64,可提高auc
2019-05-11 08:34:19,678-INFO: TRAIN --> pass: 9 batch: 2577 loss: 0.467225006104 auc: 0.787909292672, batch_auc: 0.797377570934
pass_id: 0, pass_time_cost: 3150.447569
pass_id: 1, pass_time_cost: 3177.322331
pass_id: 2, pass_time_cost: 3174.676812
pass_id: 3, pass_time_cost: 3209.558880
pass_id: 4, pass_time_cost: 3134.910369
pass_id: 5, pass_time_cost: 3202.956675
pass_id: 6, pass_time_cost: 3169.575809
pass_id: 7, pass_time_cost: 3210.294044
pass_id: 8, pass_time_cost: 3039.102302
pass_id: 9, pass_time_cost: 3036.933163
python infer.py \
--model_path models/pass-0/ \
--data_path data/raw/valid.txt
预测结果:
2019-05-13 09:35:49,177-INFO: TEST --> batch: 4500 loss: [0.46127334] auc: [0.78797872]
类别 | label | 数量 | 比例 |
---|---|---|---|
负样本 | 0 | 34095179 | 0.74377662 |
正样本 | 1 | 11745438 | 0.25622338 |
评估 | batch_size | batch_1000 | batch_1000 | batch_64 | batch_64 |
---|---|---|---|---|---|
优化方式 | 评估 | 一层网络 | 三层网络 | 一层网络 | 三层网络 |
mini_demo | time | 33s | 35s | 97s | 103s |
mini_demo | auc | 0.50234167 | 0.54893279 | 0.721332392 | 0.74322927 |
数据集 | batch_size | time | auc |
---|---|---|---|
demo | 64 | 1133s | 0.73777626 |
全量 | 64 | 3150s | 0.81093872 |
由以上两个表格可知: