本文主要通过视频演示如何在御膳房平台进行第二赛季的资金流入流出预测,主要参考了[资金流入流出预测]第二赛季参赛(平台使用)攻略之混沌队的使用说明。
视频演示
基本步骤:
- 在“数据开发”,使用SQL创建表
lt_user_balance_table_sum_baseline
(每天的购买赎回总额)和lt_basic_feature4to8_baseline
(4至8月的特征及其购买赎回总额); - 在“算法平台”,使用R生成9月的特征表
lt_basic_feature9_baseline
; - 在“算法平台”,使用R中的线性回归预测9月的购买赎回情况表
tc_comp_predict_table
,系统自动根据此表评分。
相关代码
创建表[demo_create_table]
注意事项:
- 使用平台提供的数据表须加上项目名
tianchi_finance
,比如tianchi_finance.user_balance_table
; - “算法平台”不能直接使用系统的表,只能使用自己生成的表,比如
lt_user_balance_table_sum_baseline
。
--日申购赎回汇总表
create table if not exists lt_user_balance_table_sum_baseline(
report_date string comment '日期',
total_purchase_amt bigint comment '日申购总额',
total_redeem_amt bigint comment '日赎回总额'
) comment '日申购赎回汇总表' partitioned by (ds string); --ds:不同的分区,不同的时间区间。
insert overwrite table lt_user_balance_table_sum_baseline partition(ds='all')
select
report_date,
sum(total_purchase_amt) total_purchase_amt,
sum(total_redeem_amt) total_redeem_amt
from tianchi_finance.user_balance_table --内部赛主办方提供的申购赎回记录表。
group by report_date;
--例如要用4-8月数据作为训练集
insert overwrite table lt_user_balance_table_sum_baseline partition(ds='45678month')
select report_date, total_purchase_amt, total_redeem_amt
from lt_user_balance_table_sum_baseline
where ds = 'all' and report_date >='20140401' and report_date < '20140901';
--构建训练集基础特征:
create table if not exists lt_basic_feature4to8_baseline as
select
t1.report_date,
case when t1.dayOfWeek=0 then 1 else 0 end as monday,
case when t1.dayOfWeek=1 then 1 else 0 end as tuesday,
case when t1.dayOfWeek=2 then 1 else 0 end as wednesday,
case when t1.dayOfWeek=3 then 1 else 0 end as thursday,
case when t1.dayOfWeek=4 then 1 else 0 end as friday,
case when t1.dayOfWeek=5 then 1 else 0 end as saturday,
case when t1.dayOfWeek=6 then 1 else 0 end as sunday,
total_purchase_amt,
total_redeem_amt
from (
select
report_date,
weekday(to_date(report_date,"yyyyMMdd")) dayOfWeek,
total_purchase_amt,
total_redeem_amt
from lt_user_balance_table_sum_baseline
where ds = '45678month'
) t1;
生成9月特征[demo_produce_features]
注意事项:
- R脚本必须要添加“ODPS源”,虽然有时实际上并未使用;
- 使用星期作为特征,例如,某天是
monday
,对应值为1,其余全为0; - R脚本输出的“ODPS目标”,自动根据
data.frame
的格式生成特征表lt_basic_feature9_baseline
。
# 请链接输入数据
# 链接完成后,系统会自动生成映射代码,将输入数据映射成变量参数,用户可直接使用
# 切记不可修改系统生成代码,否则运行将报错
# 端口1的表数据映射成dataset1
dataset1 <- pai.inputPort(1) # class: data.frame
start_date <- as.Date("2014-09-01")
num_days <- 30
num_features <- 7
date_Ymd <- as.integer(format(as.Date(start_date : (start_date + num_days - 1),
origin="1970-01-01"), "%Y%m%d"))
september_features = matrix(0, num_days, num_features + 1)
september_features[, 1] <- date_Ymd
feature_indexs <- (date_Ymd + 3) %% 7 + 2
for (i in 1 : num_days)
{
september_features[i, feature_indexs[i]] <- 1
}
colnames(september_features) <- c("report_date",
"monday",
"tuesday",
"wednesday",
"thursday",
"friday",
"saturday",
"sunday")
september_features <- data.frame(september_features)
for (i in 1 : (num_features + 1))
{
september_features[, i] <- as.integer(september_features[, i])
}
# 用户指定数据变量dataname(class:data.frame)到输出端口
# 平台会将该数据生成ODPS表
# dataname务必修改成自己的变量名称
pai.outputPort(1, september_features)
回归预测[demo_predict]
注意事项:
- “R脚本”框上方左一小圆点连线表示
dataset1 <- pai.inputPort(1)
,左二小圆点连线表示dataset2 <- pai.inputPort(2)
,“R脚本”框共支持4个输入; - R脚本输出的“ODPS目标”,自动根据
data.frame
格式生成表tc_comp_predict_table
(系统会自动根据此表评分)。
# 请链接输入数据
# 链接完成后,系统会自动生成映射代码,将输入数据映射成变量参数,用户可直接使用
# 切记不可修改系统生成代码,否则运行将报错
# 端口2的表数据映射成dataset2
dataset2 <- pai.inputPort(2) # class: data.frame
# 端口1的表数据映射成dataset1
dataset1 <- pai.inputPort(1) # class: data.frame
result <- data.frame(report_date = as.integer(dataset2$report_date))
# predict purchase
model <- lm(total_purchase_amt ~
monday + tuesday + wednesday + thursday + friday +
saturday + sunday,
dataset1)
model.predict <- data.frame(predict(model, dataset2,
interval="prediction",
level=0.95, se.fit=FALSE))
result$purchase <- as.integer(round(model.predict$fit))
# predict redeem
model <- lm(total_redeem_amt ~
monday + tuesday + wednesday + thursday + friday +
saturday + sunday,
dataset1)
model.predict <-data.frame(predict(model, dataset2,
interval="prediction",
level=0.95, se.fit=FALSE))
result$redeem <- as.integer(round(model.predict$fit))
# 用户指定数据变量dataname(class:data.frame)到输出端口
# 平台会将该数据生成ODPS表
# dataname务必修改成自己的变量名称
pai.outputPort(1, result)
实用技能
查看平台的R版本
注意:由于平台禁止了调用installed.packages
、.packages
和.libPaths
函数,目前还无法得知平台安装了哪些R包。
R_version <- data.frame(name = c(names(version)))
for (i in 1 : length(version))
{
R_version[i, 2] <- as.character(version[[i]])
}
R_version[1] <- as.character(R_version[[1]])
colnames(R_version)[2] <- 'value'
硬写入数据到评分表
本机离线得到的预测结果,直接利用“算法平台”的R代码,写入评分表。
manual_input <- c(20140901,973211410,1121867054,
20140902,1065557799,1048471809,
20140903,996550847,998207189,
20140904,934368548,939173481,
20140905,805889443,834056370,
20140906,559946861,582517716,
20140907,623372630,666986337,
20140908,973211410,1121867054,
20140909,1065557799,1048471809,
20140910,996550847,998207189,
20140911,934368548,939173481,
20140912,805889443,834056370,
20140913,559946861,582517716,
20140914,623372630,666986337,
20140915,973211410,1121867054,
20140916,1065557799,1048471809,
20140917,996550847,998207189,
20140918,934368548,939173481,
20140919,805889443,834056370,
20140920,559946861,582517716,
20140921,623372630,666986337,
20140922,973211410,1121867054,
20140923,1065557799,1048471809,
20140924,996550847,998207189,
20140925,934368548,939173481,
20140926,805889443,834056370,
20140927,559946861,582517716,
20140928,623372630,666986337,
20140929,973211410,1121867054,
20140930,1065557799,1048471809);
num_days <- 30
num_result_col <- 3
result <- t(matrix(manual_input, num_result_col, num_days))
colnames(result) <- c('report_date', 'purchase', 'redeem')
result <- data.frame(result)
result$report_date <- as.character(result$report_date)
result$purchase <- as.integer(result$purchase)
result$redeem <- as.integer(result$redeem)
问题处理
子查询记录超过1000
FAILED: ODPS-0130111:Subquery partition pruning exception - records returned from subquery exceeded limit of 1000
解决办法:用JOIN
改写IN
。
原来的IN
查询:
CREATE TABLE
IF NOT EXISTS vip_balance_table AS SELECT
*
FROM
user_balance_table
WHERE
user_id IN (
SELECT
user_id
FROM
(
SELECT
user_id,
MAX (total_purchase_amt) AS max_purchase,
MAX (total_redeem_amt) AS max_redeem
FROM
user_balance_table
GROUP BY
user_id
) T
WHERE
max_purchase >= 2000000
AND max_redeem >= 2000000
);
用JOIN
改写:
CREATE TABLE vip_balance_table AS SELECT
user_balance_table.*
FROM
user_balance_table
JOIN (
SELECT DISTINCT
user_id
FROM
(
SELECT
user_id,
MAX (total_purchase_amt) AS max_purchase,
MAX (total_redeem_amt) AS max_redeem
FROM
user_balance_table
GROUP BY
user_id
) T
WHERE
max_purchase >= 2000000
AND max_redeem >= 2000000
) vip_user_id ON (user_balance_table.user_id = vip_user_id.user_id)