LSTM 时序模型:每个样本包含 5 个时间步,每个时间步包含 8 个特征,模型输入张量的形状为 (num_samples, time_steps, num_features),即 (num_samples, 5, 8)。


4.1 sql提取数据

-- Builds the LSTM training set (Hive SQL): one row per (match, minute) holding
-- the normalized odds features of the current minute and the four preceding
-- rows of the same match, flattened oldest-first into a comma-separated string
-- (5 time steps x 8 features = 40 values), plus the label `ballflag`.
--
-- Changes against the previous revision:
--   * string literals are quoted again (the query did not parse without them);
--   * `ballflag` is no longer part of the feature string: it is the label, and
--     including it produced 9 values per step instead of the 8 the model expects;
--   * '受让平手/半球' maps to 0.25 like '平手/半球' (it was 0.75, the only
--     "受让" entry that disagreed with its plain counterpart);
--   * the duplicated `feature1 is not null` predicate became the missing
--     `feature is not null` check;
--   * unused row_number() columns were dropped and comma joins became explicit
--     inner joins.
--
-- NOTE(review): odds come from `ft_all_odds` while the label timeline comes
-- from `ft_t_odds`; confirm that two different tables are intended.
-- NOTE(review): LAG() walks over the rows that survive the joins, so the five
-- steps are consecutive rows, not necessarily consecutive minutes.

with asia_raw as (
    -- In-play Asian-handicap quotes; row_id ranks duplicates per (match, minute).
    select
        match_id,
        odds_date as asia_odds_date,
        regexp_replace(regexp_replace(odds, '\\}', ''), '\\{', '') as asia_odds,
        `time` as asia_time,
        row_number() over (
            partition by ft_all_odds.match_id, ft_all_odds.`time`
            order by ft_all_odds.sort
        ) as row_id
    from ft_all_odds
    where handicap_type = 1
      and tag = '滚'
      and company_id = 3
      and odds not like '%封%'      -- skip suspended markets
      and handicap_num = 1
      and `time` != '中场'          -- skip the half-time marker
),

asia as (
    -- asia_odds is "front,handicap_name,back"; the handicap name is mapped to
    -- its numeric line. Unknown names fall through to the sentinel 20 and are
    -- filtered out when the sources are joined.
    select
        match_id,
        asia_odds_date,
        split(asia_odds, ',')[0] as asia_front,
        split(asia_odds, ',')[1] as flag,
        case split(asia_odds, ',')[1]
            when '平手' then 0
            when '平手/半球' then 0.25       when '受让平手/半球' then 0.25
            when '半球' then 0.5             when '受让半球' then 0.5
            when '半球/一球' then 0.75       when '受让半球/一球' then 0.75
            when '一球' then 1.0             when '受让一球' then 1.0
            when '一球/球半' then 1.25       when '受让一球/球半' then 1.25
            when '球半' then 1.5             when '受让球半' then 1.5
            when '球半/两球' then 1.75       when '受让球半/两球' then 1.75
            when '两球' then 2.0             when '受让两球' then 2.0
            when '两球/两球半' then 2.25     when '受让两球/两球半' then 2.25
            when '两球半' then 2.5           when '受让两球半' then 2.5
            when '两球半/三球' then 2.75     when '受让两球半/三球' then 2.75
            when '三球' then 3.0             when '受让三球' then 3.0
            when '三球/三球半' then 3.25     when '受让三球/三球半' then 3.25
            when '三球半' then 3.5           when '受让三球半' then 3.5
            when '三球半/四球' then 3.75     when '受让三球半/四球' then 3.75
            when '四球' then 4.0             when '受让四球' then 4.0
            when '四球/四球半' then 4.25     when '受让四球/四球半' then 4.25
            when '四球半' then 4.5           when '受让四球半' then 4.5
            when '四球半/五球' then 4.75     when '受让四球半/五球' then 4.75
            when '五球' then 5.0             when '受让五球' then 5.0
            when '五球/五球半' then 5.25     when '受让五球/五球半' then 5.25
            when '五球半' then 5.5           when '受让五球半' then 5.5
            when '五球半/六球' then 5.75     when '受让五球半/六球' then 5.75
            when '六球' then 6.0             when '受让六球' then 6.0
            when '六球/六球半' then 6.25     when '受让六球/六球半' then 6.25
            when '六球半' then 6.5           when '受让六球半' then 6.5
            when '六球半/七球' then 6.75     when '受让六球半/七球' then 6.75
            when '七球' then 7.0             when '受让七球' then 7.0
            when '七球/七球半' then 7.25     when '受让七球/七球半' then 7.25
            when '七球半' then 7.5           when '受让七球半' then 7.5
            when '七球半/八球' then 7.75     when '受让七球半/八球' then 7.75
            when '八球' then 8.0             when '受让八球' then 8.0
            when '八球/八球半' then 8.25
            when '八球半' then 8.5           when '受让八球半' then 8.5
            when '八球半/九球' then 8.75     when '受让八球半/九球' then 8.75
            when '九球' then 9.0             when '受让九球' then 9.0
            when '九球/九球半' then 9.25     when '受让九球/九球半' then 9.25
            when '九球半' then 9.5           when '受让九球半' then 9.5
            when '九球半/十球' then 9.75     when '受让九球半/十球' then 9.75
            when '十球' then 10
            when '10.5' then 10.5
            when '10.75' then 10.75
            when '11.5' then 11.5
            when '11.75' then 11.75
            when '13' then 13
            else 20
        end as asia_yapan,
        split(asia_odds, ',')[2] as asia_back,
        asia_odds,
        asia_time
    from asia_raw
    where row_id = 1
),

goal_raw as (
    -- In-play over/under (total goals) quotes, same de-duplication as asia_raw.
    select
        match_id,
        odds_date as goal_odds_date,
        regexp_replace(regexp_replace(odds, '\\}', ''), '\\{', '') as goal_odds,
        `time` as goal_time,
        row_number() over (
            partition by ft_all_odds.match_id, ft_all_odds.`time`
            order by ft_all_odds.sort
        ) as row_id
    from ft_all_odds
    where handicap_type = 3
      and tag = '滚'
      and company_id = 3
      and handicap_num = 1
      and odds not like '%封%'
      and `time` != '中场'
),

goal as (
    -- goal_odds is "over,line,under"; only the part of the line before '/' is kept.
    select
        match_id,
        goal_odds_date,
        split(goal_odds, ',')[0] as goal_big,
        split(split(goal_odds, ',')[1], '/')[0] as goal_pan,
        split(goal_odds, ',')[2] as goal_small,
        goal_odds,
        goal_time
    from goal_raw
    where row_id = 1
),

scoring_match as (
    -- Matches since 2019-08-01 whose combined score after minute 60 is above zero.
    select
        scored.match_id
    from (
        select
            match_id,
            split(score, '-')[0] + split(score, '-')[1] as total_score
        from ft_all_odds
        where `time` != '中场'
          and `time` != ''
          and tag = '滚'
          and company_id = 3
          and handicap_num = 1
          and from_unixtime(unix_timestamp(odds_date), 'yyyy-MM-dd HH:mm:ss')
              between '2019-08-01' and current_date
          and `time` > 60
    ) scored
    group by scored.match_id
    having max(scored.total_score) > 0
),

score_timeline as (
    -- Combined score (home + away goals) at every recorded in-play minute.
    select
        match_id,
        split(score, '-')[0] + split(score, '-')[1] as score,
        cast(`time` as int) as `time`
    from ft_t_odds
    where tag = '滚'
      and company_id = 3
      and handicap_num = 1
      and `time` != '中场'
),

final_score as (
    -- Highest combined score reached in each match.
    select
        match_id,
        max(cast(score as int)) as max_score
    from score_timeline
    group by match_id
),

ball_label as (
    -- Label: 1 for rows in the latest third (NTILE bucket 1, time descending)
    -- of every score state that is not the final score, i.e. the minutes
    -- closest to the next goal; 0 everywhere else.
    select
        ranked.match_id,
        if(ranked.max_flag = 1, if(ranked.rowId = 1, 1, 0), 0) as flag,
        ranked.score,
        ranked.`time`
    from (
        select
            tl.match_id,
            tl.score,
            tl.`time`,
            if(tl.score = fs.max_score, 0, 1) as max_flag,
            ntile(3) over (
                partition by tl.match_id, tl.score
                order by tl.`time` desc
            ) as rowId
        from score_timeline tl
        inner join final_score fs
            on tl.match_id = fs.match_id
    ) ranked
),

joined as (
    -- One candidate row per (match, minute) carrying both odds types and the
    -- label; row_id picks the earliest quote when several share the same keys.
    select
        mid.match_id as match_id,
        cast(asia.asia_front as float) as asia_front,
        if(asia.flag like '%受让%', 1, 0) as flag,
        cast(asia.asia_yapan as float) as asia_yapan,
        cast(asia.asia_back as float) as asia_back,
        cast(goal.goal_big as float) as goal_big,
        cast(goal.goal_pan as float) as goal_pan,
        cast(goal.goal_small as float) as goal_small,
        tt.flag as ballflag,
        tt.`time` as tt_time,
        row_number() over (
            partition by mid.match_id, asia.asia_time, goal.goal_time, tt.`time`
            order by asia.asia_odds_date
        ) as row_id
    from asia
    inner join goal
        on asia.match_id = goal.match_id
       and asia.asia_time = goal.goal_time
    inner join scoring_match mid
        on goal.match_id = mid.match_id
    inner join ball_label tt
        on tt.match_id = goal.match_id
       and tt.`time` = goal.goal_time
    where asia.asia_yapan != 20      -- drop handicap names the CASE does not know
),

normalized as (
    -- Min-max scaling clipped to [0, 1] and rounded to two decimals; the bounds
    -- are the fixed constants written in each expression.
    select
        match_id,
        round(if(asia_front <= 0.25, 0.00, if(asia_front >= 2, 1.00, (asia_front - 0.25) / 1.75)), 2) as asia_front,
        round(if(asia_yapan <= 0, 0, if(asia_yapan >= 6, 1, (asia_yapan - 0) / 6)), 2) as asia_yapan,
        round(if(asia_back <= 0.35, 0, if(asia_back >= 2, 1, (asia_back - 0.35) / 1.65)), 2) as asia_back,
        round(if(goal_big <= 0.5, 0, if(goal_big >= 3.5, 1, (goal_big - 0.5) / 3)), 2) as goal_big,
        round(if(goal_pan <= 0.5, 0, if(goal_pan >= 6.5, 1, (goal_pan - 0.5) / 6)), 2) as goal_pan,
        round(if(goal_small <= 0.1, 0, if(goal_small >= 1.2, 1, (goal_small - 0.1) / 1.1)), 2) as goal_small,
        if(flag = 0, 1, 0) as home_assign_label,   -- handicap name without '受让'
        if(flag = 1, 1, 0) as away_assign_label,   -- handicap name containing '受让'
        tt_time as `time`,
        ballflag
    from joined
    where row_id = 1
),

step_feature as (
    -- The 8 features of one time step as a comma-separated string.
    select
        match_id,
        concat(
            asia_front, ',', asia_yapan, ',', asia_back, ',',
            goal_big, ',', goal_pan, ',', goal_small, ',',
            home_assign_label, ',', away_assign_label
        ) as feature,
        ballflag,
        `time`
    from normalized
),

windowed as (
    -- Attach the four preceding rows of the same match.
    select
        match_id,
        feature,
        lag(feature, 1) over (partition by match_id order by `time`) as feature1,
        lag(feature, 2) over (partition by match_id order by `time`) as feature2,
        lag(feature, 3) over (partition by match_id order by `time`) as feature3,
        lag(feature, 4) over (partition by match_id order by `time`) as feature4,
        ballflag
    from step_feature
)

select
    match_id,
    concat(feature4, ',', feature3, ',', feature2, ',', feature1, ',', feature) as feature,
    ballflag
from windowed
where feature is not null
  and feature1 is not null
  and feature2 is not null
  and feature3 is not null
  and feature4 is not null

4.2 数据处理 (data_processing.py)


# -*- coding: utf-8 -*-
"""Exploratory analysis and normalization of the in-play odds features.

The script reads ./data/yijinqiu_Q.csv, removes rows with suspended quotes and
matches with an unusual number of records, plots and tabulates the feature
distributions, and finally writes min-max normalized features to
odds_normalization_data.csv.
"""
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn

sbn.set(style="whitegrid", color_codes=True)  # plotting style
plt.rcParams["font.family"] = ["sans-serif"]
plt.rcParams["font.sans-serif"] = ["SimHei"]  # font able to render Chinese labels
plt.rcParams["axes.unicode_minus"] = False
plt.rcParams["figure.figsize"] = (15.0, 10.0)  # figure size
plt.rcParams["savefig.dpi"] = 200  # resolution of saved images
plt.rcParams["figure.dpi"] = 200  # on-screen resolution

# TensorFlow log filter; environment values must be strings.
# "2" hides INFO and WARNING messages.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

#########################################################
# 1. read data
#########################################################
odds_data = pd.read_csv("./data/yijinqiu_Q.csv")
odds_data["match_id"] = "A_" + odds_data["match_id"].astype(str)
print(odds_data.shape)
print(odds_data.head(10))
print(odds_data.columns)
print(odds_data.dtypes)


#########################################################
# 2. data preprocessing
#########################################################
# 2.1 exclude anomalous data
def check_feature_special_values(df, feature_cols):
    """Count placeholder values ("" and "封") in each feature column.

    :param df: data frame
    :param feature_cols: names of the columns to check
    :return: dict mapping column name -> number of placeholder values
    """
    return {col: int(df[col].isin(["", "封"]).sum()) for col in feature_cols}


feature_cols = ["asia_front", "asia_yapan", "asia_back", "goal_big", "goal_pan", "goal_small"]
special_stats = check_feature_special_values(odds_data, feature_cols)
print(special_stats)

# 2.1.1 abnormal feature value: drop rows whose goal line is suspended ("封")
odds_data_copy = odds_data[odds_data["goal_pan"] != "封"]
special_stats1 = check_feature_special_values(odds_data_copy, feature_cols)
print(special_stats1)
print(odds_data_copy.shape)

# 2.1.2 abnormal record number of match
# Number of records per match, then number of matches per record count.
# NOTE(review): the counts are taken from the unfiltered `odds_data`, not from
# `odds_data_copy`; confirm that suspended rows should count here.
record_num_stats = odds_data.groupby(["match_id"])["asia_front"].count().reset_index()
record_num_stats.rename(columns={"asia_front": "record_num"}, inplace=True)
print(record_num_stats.head(10))
record_match_stats = record_num_stats.groupby(["record_num"])["match_id"].count().reset_index()
record_match_stats.rename(columns={"match_id": "match_num"}, inplace=True)
print(record_match_stats.head())
# record_match_stats.to_csv("record_match_stats.csv", encoding="utf-8", index=False)

# Keep matches with 19 < record count <= 90. Selecting the column through .loc
# avoids dropping a column in place on a filtered slice.
record_num_mask = (19 < record_num_stats["record_num"]) & (record_num_stats["record_num"] <= 90)
select_match = record_num_stats.loc[record_num_mask, ["match_id"]]
odds_data_final = odds_data_copy.merge(select_match, how="inner", on="match_id")
print(odds_data_final.head())

# 2.2 feature transform
# one-hot encode "flag"
odds_data_final["home_assign_label"] = (odds_data_final["flag"] == 0).astype(int)
odds_data_final["away_assign_label"] = (odds_data_final["flag"] == 1).astype(int)

# feature type transform
lst = ["goal_big", "goal_pan", "goal_small"]


def type_convert(df, col_list):
    """Convert the given columns of `df` to float64 in place and return `df`."""
    for col in col_list:
        df[col] = df[col].astype(np.float64)
    return df


odds_data_final = type_convert(odds_data_final, lst)
print(odds_data_final.dtypes)


#########################################################
# 3. feature explore analysis
#########################################################
# 3.1 feature visualization
def _label_axes(col, title, font_size):
    """Apply the shared layout, x label and title to the current axes."""
    font = {"weight": "normal", "size": font_size}
    plt.tight_layout()
    plt.xlabel(col, fontdict=font)
    plt.title(title, fontdict=font)


def continuous_feature_plot(df, hist_feature_list, n_bins=50, font_size=14, target=None):
    """Plot histogram/KDE and violin plots of continuous features.

    When `target` is given, a grouped histogram and a grouped kernel density
    plot are drawn as well, using the first two distinct values of `target`.
    Each figure is saved as "<col> -- histgram & boxplot.png" and shown, and
    the column's describe() summary is printed.

    :param df: data frame
    :param hist_feature_list: continuous feature list
    :param n_bins: number of bins for the grouped histogram, default 50
    :param font_size: font size, default 14
    :param target: target column, or None to skip the grouped plots
    :return: None
    """
    for col in hist_feature_list:
        print("连续特征:", col)
        if target is not None:
            plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True, figsize=(16, 8), facecolor="gray")

            # histogram
            plt.subplot(221)
            sbn.distplot(df[col])
            _label_axes(col, "{col} -- histgram".format(col=col), font_size)

            # violin plot (quartiles)
            plt.subplot(222)
            sbn.violinplot(x=col, data=df, palette="Set2", split=True, scale="area", inner="quartile")
            _label_axes(col, "{col} -- violin plot".format(col=col), font_size)

            print("进行分组可视化......")
            unique_vals = df[target].unique().tolist()
            unique_val0 = df[df[target] == unique_vals[0]]
            unique_val1 = df[df[target] == unique_vals[1]]

            # grouped histogram
            plt.subplot(223)
            sbn.distplot(unique_val0[col], bins=n_bins, kde=False, norm_hist=True,
                         color="steelblue", label=str(unique_vals[0]))
            sbn.distplot(unique_val1[col], bins=n_bins, kde=False, norm_hist=True,
                         color="purple", label=str(unique_vals[1]))
            plt.legend()
            _label_axes(col, "{col} -- grouped histgram ".format(col=col), font_size)

            # grouped kernel density diagram
            plt.subplot(224)
            sbn.distplot(unique_val0[col], hist=False, kde_kws={"color": "red", "linestyle": "-"},
                         norm_hist=True, label=str(unique_vals[0]))
            sbn.distplot(unique_val1[col], hist=False, kde_kws={"color": "black", "linestyle": "--"},
                         norm_hist=True, label=str(unique_vals[1]))
            plt.legend()
            _label_axes(col, "{col} -- grouped kernel density diagram".format(col=col), font_size)
        else:
            plt.subplots(nrows=1, ncols=2, figsize=(16, 8), facecolor="gray")

            # histogram
            plt.subplot(121)
            sbn.distplot(df[col])
            _label_axes(col, "{col} -- histgram ".format(col=col), font_size)

            # violin plot (quartiles)
            plt.subplot(122)
            sbn.violinplot(x=col, data=df, palette="Set2", split=True, scale="area", inner="quartile")
            _label_axes(col, "{col} -- violin plot".format(col=col), font_size)

        plt.savefig("{col} -- histgram & boxplot.png".format(col=col))
        plt.show()
        print(df[col].describe())


continuous_feature_plot(odds_data_final, hist_feature_list=feature_cols, n_bins=50, font_size=14,
                        target="ballflag")


# 3.2 feature values statistics
def feature_stat(df, feature_list, target, feature_type="numeric"):
    """Tabulate how often each distinct value of every feature occurs.

    :param df: data frame
    :param feature_list: feature list; `target` itself is skipped
    :param target: target column; its non-null entries are what gets counted
    :param feature_type: "object" sorts each feature's rows by share descending,
        anything else (default "numeric") sorts by value ascending
    :return: data frame with columns value, count, pct (count / len(df)), feature
    """
    columns = ["value", "count", "pct", "feature"]
    n = len(df)
    per_feature = []
    for col in feature_list:
        if col == target:
            continue
        col_stat = df.groupby(col)[target].count().reset_index()
        col_stat.rename(columns={col: "value", target: "count"}, inplace=True)
        col_stat["pct"] = col_stat["count"] / n
        col_stat["feature"] = col
        if feature_type == "object":
            col_stat = col_stat.sort_values(by="pct", axis=0, ascending=False)
        else:
            col_stat = col_stat.sort_values("value", axis=0, ascending=True)
        per_feature.append(col_stat[columns])
    if not per_feature:
        return pd.DataFrame(columns=columns)
    # DataFrame.append was removed in pandas 2.x; concat is the replacement.
    return pd.concat(per_feature, ignore_index=True)


feature_value_stat = feature_stat(odds_data_final, feature_list=feature_cols, target="ballflag",
                                  feature_type="numeric")
print(feature_value_stat.head())
# save the feature value statistics result
# feature_value_stat.to_csv("feature_value_stat.csv", encoding="utf-8", index=False)


#########################################################
# 4. feature engineering
#########################################################
# 4.1 capping (mu - k*sigma, mu + k*sigma)
def feature_capping_value(df, feature_cols, confidence_param=2):
    """Compute capping bounds mu -/+ confidence_param * sigma for each feature.

    :param df: data frame
    :param feature_cols: feature columns
    :param confidence_param: number of standard deviations, default 2
    :return: (min_value_dict, max_value_dict), values rounded to two decimals
    """
    max_value_dict = {}
    min_value_dict = {}
    for col in feature_cols:
        col_data = list(df[col])
        mu = np.mean(col_data)
        sigma = np.std(col_data)
        max_value_dict[col] = round(mu + confidence_param * sigma, 2)
        min_value_dict[col] = round(mu - confidence_param * sigma, 2)
    return min_value_dict, max_value_dict


# min_feature_dict, max_feature_dict = feature_capping_value(odds_data_final, feature_cols=feature_cols)
# print(min_feature_dict)
# print(max_feature_dict)


# 4.2 feature percentile truncation
def percentile_truncation(df, feature_cols, percentile_param=(0.01, 0.99)):
    """Compute lower/upper truncation bounds from quantiles of each feature.

    :param df: data frame
    :param feature_cols: feature columns
    :param percentile_param: (lower, upper) quantiles as fractions in [0, 1]
    :return: (min_value_dict, max_value_dict), values rounded to two decimals
    """
    lower_q, upper_q = percentile_param[0], percentile_param[1]
    max_value_dict = {}
    min_value_dict = {}
    for col in feature_cols:
        col_data = list(df[col])
        # np.quantile takes fractions; np.percentile would read 0.01 as the
        # 0.01th percentile rather than the 1st.
        min_value_dict[col] = round(float(np.quantile(col_data, lower_q)), 2)
        max_value_dict[col] = round(float(np.quantile(col_data, upper_q)), 2)
    return min_value_dict, max_value_dict


# min_feature_dict, max_feature_dict = percentile_truncation(odds_data_final, feature_cols=feature_cols)
# print(min_feature_dict)
# print(max_feature_dict)

# *******************************************
# feature normalization: fixed bounds used for min-max scaling
min_feature_dict = {"asia_front": 0.25, "asia_yapan": 0.0, "asia_back": 0.35,
                    "goal_big": 0.5, "goal_pan": 0.5, "goal_small": 0.1}
max_feature_dict = {"asia_front": 2.0, "asia_yapan": 6.0, "asia_back": 2.0,
                    "goal_big": 3.5, "goal_pan": 6.5, "goal_small": 1.2}


def feature_normalize(df, min_value_dict, max_value_dict):
    """Min-max scale features into [0, 1] using the supplied bounds.

    Values at or below the minimum become 0.0, values at or above the maximum
    become 1.0, everything in between is scaled linearly and rounded to two
    decimals. Each feature `col` is replaced by a `col + "_normalize"` column.
    The input frame is left untouched.

    :param df: data frame
    :param min_value_dict: minimum value of each feature
    :param max_value_dict: maximum value of each feature; its keys select the columns
    :return: new data frame holding the normalized features instead of the raw ones
    """
    feature_cols = list(max_value_dict.keys())
    normalized = df.copy()
    for col in feature_cols:
        col_min_value = min_value_dict[col]
        col_max_value = max_value_dict[col]
        scaled = (normalized[col].astype("float64") - col_min_value) / (col_max_value - col_min_value)
        normalized[col + "_normalize"] = scaled.clip(lower=0.0, upper=1.0).round(2)
    # `columns=` is required: a bare drop(feature_cols) looks the names up in the row index.
    return normalized.drop(columns=feature_cols)


odds_normalize_df = feature_normalize(odds_data_final, min_feature_dict, max_feature_dict)
# check feature range after normalization
print(odds_normalize_df.describe().astype("float64"))
# save the normalized odds data
odds_normalize_df.to_csv("odds_normalization_data.csv", encoding="utf-8", index=False)

4.3 模型辅助函数 (model_helper_udf.py)

# -*- coding: utf-8 -*-
"""Helper functions for sampling, reshaping, training and serving the LSTM model."""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras import Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.utils import to_categorical, plot_model
from keras.models import load_model


def sample_extract(df, target_col, frac_0=0.1, frac_1=0.2, random_state=1234):
    """Sample each class of a binary target at its own rate.

    :param df: data frame
    :param target_col: target column holding the labels 0 and 1
    :param frac_0: fraction of the rows with label 0 to keep
    :param frac_1: fraction of the rows with label 1 to keep
    :param random_state: seed used for both draws
    :return: data frame with the sampled label-1 rows followed by the sampled label-0 rows
    """
    negatives = df[df[target_col] == 0]
    positives = df[df[target_col] == 1]
    sampled_negatives = negatives.sample(frac=frac_0, replace=False, random_state=random_state)
    print(sampled_negatives.shape)
    sampled_positives = positives.sample(frac=frac_1, replace=False, random_state=random_state)
    print(sampled_positives.shape)
    sampled = pd.concat([sampled_positives, sampled_negatives], axis=0)
    print(sampled.shape)
    return sampled


def reshape_and_split_data(df, feature_col, target_col, time_steps, num_features, n_classes=2,
                           sample_frac=0.8, random_state=1234):
    """Turn comma-separated feature strings into LSTM input arrays and split them.

    :param df: data frame
    :param feature_col: column holding time_steps * num_features comma-separated numbers
    :param target_col: target column with integer class labels
    :param time_steps: time steps per sample
    :param num_features: number of features per time step
    :param n_classes: number of classes for the one-hot labels
    :param sample_frac: share of the samples assigned to the training set
    :param random_state: seed for the train/test split
    :return: X_train, X_test, y_train, y_test; X has shape
        (samples, time_steps, num_features), y has shape (samples, n_classes)
    :raises ValueError: if `df` is empty or a feature string has the wrong length
    """
    n_samples = df.shape[0]
    if n_samples == 0:
        raise ValueError("reshape_and_split_data received an empty data frame")

    expected_len = time_steps * num_features
    sequences = []
    # Iterate over the values rather than index labels so that duplicated index
    # labels (e.g. after concatenating frames) cannot return several rows at once.
    for position, raw in enumerate(df[feature_col].tolist()):
        values = np.array(str(raw).split(","), dtype=np.float64)
        if values.size != expected_len:
            raise ValueError(
                "row {} of '{}' has {} values, expected {} ({} x {})".format(
                    position, feature_col, values.size, expected_len, time_steps, num_features))
        sequences.append(values.reshape((time_steps, num_features)))
    X = np.array(sequences).reshape(n_samples, time_steps, num_features)

    # one-hot labels
    y = to_categorical(df[target_col], num_classes=n_classes).reshape(n_samples, n_classes)

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=sample_frac,
                                                        random_state=random_state)
    return X_train, X_test, y_train, y_test


def lstm_model(num_classes, time_steps, num_features):
    """Build and compile the two-layer LSTM classifier.

    :param num_classes: size of the softmax output layer
    :param time_steps: time steps per sample
    :param num_features: number of features per time step
    :return: compiled Keras model
    """
    model = Sequential()
    model.add(LSTM(32, activation="relu", return_sequences=True, input_shape=(time_steps, num_features)))
    model.add(LSTM(16, activation="relu", dropout=0.5, recurrent_dropout=0.5))
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    # summary() prints the table itself and returns None, so it must not be
    # passed to print().
    print("Lstm model summary:\n ")
    model.summary()
    return model


def lstm_model_train(X_train, X_test, y_train, y_test, time_steps, num_features, checkpoint_path,
                     epochs=20, batch_size=10000, whether_earlystop=0):
    """Train the LSTM classifier, checkpoint the best epoch and evaluate on the test set.

    :param X_train: training features, shape (samples, time_steps, num_features)
    :param X_test: test features
    :param y_train: one-hot training labels (two classes)
    :param y_test: one-hot test labels
    :param time_steps: time steps
    :param num_features: The number of features
    :param checkpoint_path: The filepath of checkpoint
    :param epochs: The epoch times
    :param batch_size: The batch size
    :param whether_earlystop: Whether to early stop. 1 enables it, any other value disables it.
    :return: the Keras History object returned by fit()
    """
    lstm_train_model = lstm_model(num_classes=2, time_steps=time_steps, num_features=num_features)

    # The diagram is only diagnostic; missing pydot/graphviz should not stop training.
    try:
        plot_model(lstm_train_model, to_file="lstm_model.png", show_shapes=True, show_layer_names=True)
    except (ImportError, OSError) as exc:
        print("Skipping model diagram:", exc)

    callbacks = [ModelCheckpoint(checkpoint_path, monitor="val_accuracy", verbose=1,
                                 save_best_only=True, mode="max")]
    if whether_earlystop == 1:
        callbacks.insert(0, EarlyStopping(monitor="val_accuracy", patience=3, min_delta=0.0001))

    print("Beginning to model training ......\n")
    lstm_history = lstm_train_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                                        validation_split=0.2, callbacks=callbacks, verbose=1)

    print("model evaluate on test data:\n")
    lstm_model_evaluate = lstm_train_model.evaluate(X_test, y_test)
    print("测试集 Loss: {:0.4f}\n Accuracy: {:0.4f}".format(lstm_model_evaluate[0], lstm_model_evaluate[1]))
    return lstm_history


def model_load_and_predict(model_path, unknown_data):
    """Load a saved model and predict for unknown data.

    :param model_path: The filepath of model
    :param unknown_data: unknown data, data structure like (n_samples, 5, 8)
    :return: the array returned by the model's predict()
    """
    final_model = load_model(model_path)
    pred_result = final_model.predict(unknown_data)
    print("prediction result:\n", pred_result)
    return pred_result

4.4 模型训练

# -*- coding: utf-8 -*-
"""Train the LSTM model on the extracted feature sequences."""
import pandas as pd
from model_helper_udf import *

if __name__ == "__main__":
    # 1. read data
    easy_score_data = pd.read_csv("./data/easy_score_model_train_data.csv")
    print(easy_score_data.shape)
    print(easy_score_data.head())
    print(easy_score_data.columns)
    print(easy_score_data.dtypes)

    # 2. "ballflag" statistics: rows per label and each label's share
    ballflag_stat = easy_score_data.groupby(["ballflag"]).agg({"match_id": "count"})
    print(ballflag_stat)
    # Dividing by the column total does not assume that both labels 0 and 1 are present.
    ballflag_pct = (ballflag_stat / ballflag_stat.sum()).rename(columns={"match_id": "pct"})
    print(ballflag_pct)

    # 3. sample extract -- alternative sampling setups:
    #    all data:                  frac_0=1,   frac_1=1
    #    proportional subsample:    frac_0=0.5, frac_1=0.5
    #    balanced, all positives:   frac_0=0.4, frac_1=1   (used below)
    #    balanced subsample:        frac_0=0.2, frac_1=0.5
    extract_easy_score = sample_extract(easy_score_data, target_col="ballflag", frac_0=0.4, frac_1=1)

    # 4. reshape and split data
    TIME_STEPS, NUM_FEATURES = 5, 8
    X_train, X_test, y_train, y_test = reshape_and_split_data(
        extract_easy_score, "feature", "ballflag",
        time_steps=TIME_STEPS, num_features=NUM_FEATURES)
    print("The shape of train data: ", X_train.shape)
    print("The shape of test data: ", X_test.shape)

    # 5. model training; the checkpoint file name is filled in with the epoch
    #    number and the validation accuracy
    lstm_model_train(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
                     time_steps=TIME_STEPS, num_features=NUM_FEATURES,
                     checkpoint_path="val_accuracy-improvement-{epoch:02d}--{val_accuracy:.4f}.h5",
                     epochs=20, batch_size=10000, whether_earlystop=0)

4.5 模型部署


# -*- coding: utf-8 -*-
"""Load the trained model and time a prediction for a single sample."""
import numpy as np
from keras.models import load_model
import datetime

# model path
final_model_path = "easy_score_model.h5"

# Input layout of one sample: 5 time steps, 8 features per step.
TIME_STEPS, NUM_FEATURES = 5, 8

# forecast data: one row per time step
pred_data = [[0.35, 0.08, 0.39, 0.13, 0.17, 0.76, 1.00, 0.00],
             [0.38, 0.08, 0.36, 0.15, 0.17, 0.73, 1.00, 0.00],
             [0.37, 0.08, 0.38, 0.16, 0.17, 0.68, 1.00, 0.00],
             [0.41, 0.08, 0.33, 0.18, 0.17, 0.63, 1.00, 0.00],
             [0.42, 0.08, 0.32, 0.19, 0.17, 0.60, 1.00, 0.00]]
pred_data = np.array(pred_data, dtype="float64").reshape(1, TIME_STEPS, NUM_FEATURES)
print("The structure of predicted data:\n", type(pred_data))
print("The shape of predicted data:\n", pred_data.shape)

# load model
easy_score_final_model = load_model(final_model_path)

# model predict
# NOTE(review): this times the first predict() call after loading, which
# presumably includes one-off setup work; time a second call for steady-state latency.
t1 = datetime.datetime.now()
pred_result = easy_score_final_model.predict(pred_data)
t2 = datetime.datetime.now()
print("t1:\n ", t1)
print("t2:\n ", t2)
print("model predict one sample consume time:\n ", t2 - t1)

print("The structure of model prediction result:\n", type(pred_result))
print("model prediction result:\n", pred_result)
print("The shape of model prediction result:\n", pred_result.shape)








