# 推荐算法_CIKM-2019-AnalytiCup 冠军源码解读_2

1、generate_static_features.ipynb 标题简洁明了 提取静态特征

import pandas as pd
import numpy as np

def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and downcast each
    numeric column to the smallest dtype that can hold its value range,
    to reduce memory usage. Object columns become ``category``.

    Note: mutates *df* in place and also returns it.
    """
    # NOTE(review): the int/float branch ladder was garbled in the extracted
    # text; reconstructed from the standard Kaggle reduce_mem_usage utility.
    start_mem = df.memory_usage().sum() / 1024 ** 2  # bytes -> MB, so the print below is honest
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def load_data(path):
    """Load the raw user/item/behavior tables under *path* and build the
    base feature frame.

    Returns (user, item, data): the user table, the item table, and the
    behavior log enriched with day/hour, one-hot behavior flags, a
    time-decayed behavior weight, and the merged item/user attributes.
    """
    # NOTE(review): the original `def` line and the read_csv calls were lost
    # in extraction; file names below follow the round-2 dataset layout
    # pointed to by `path` -- confirm against the repository.
    user = pd.read_csv(path + 'user.csv', header=None)
    item = pd.read_csv(path + 'item.csv', header=None)
    data = pd.read_csv(path + 'user_behavior.csv', header=None)

    data.columns = ['userID', 'itemID', 'behavior', 'timestamp']
    data['day'] = data['timestamp'] // 86400        # day index since epoch of the log
    data['hour'] = data['timestamp'] // 3600 % 24   # hour of day

    # One-hot encode the behavior types.
    # NOTE(review): the loop header defining `i` was lost in extraction;
    # restored over the four behavior strings used below.
    for i in ['pv', 'fav', 'cart', 'buy']:
        data[i] = 0
        data.loc[data['behavior'] == i, i] = 1

    # Time-decayed behavior weight: recent actions count more, and stronger
    # actions (cart > fav > pv) get a larger base weight.
    data['day_hour'] = data['day'] + data['hour'] / float(24)
    data.loc[data['behavior'] == 'pv', 'behavior'] = 1
    data.loc[data['behavior'] == 'fav', 'behavior'] = 2
    data.loc[data['behavior'] == 'cart', 'behavior'] = 3
    # NOTE(review): the 'buy' mapping was missing from the snippet; without it
    # the multiplication below would fail on 'buy' rows, so it was evidently
    # lost in extraction -- confirm the weight value against the repository.
    data.loc[data['behavior'] == 'buy', 'behavior'] = 4
    max_day = max(data['day'])
    min_day = min(data['day'])
    data['behavior'] = (1 - (max_day - data['day_hour'] + 2) / (max_day - min_day + 2)) * data['behavior']

    item.columns = ['itemID', 'category', 'shop', 'brand']
    user.columns = ['userID', 'sex', 'age', 'ability']

    data = reduce_mem_usage(data)

    data = pd.merge(left=data, right=item, on='itemID', how='left')
    data = pd.merge(left=data, right=user, on='userID', how='left')

    return user, item, data



path = '../ECommAI_EUIR_round2_train_20190816/'

user, item, data = load_data(path=path)

# Per-entity popularity: number of behavior records per key value.
for key in ['itemID', 'shop', 'category', 'brand']:
    counts = data[['behavior', key]].groupby(key, as_index=False).agg({'behavior': 'count'})
    counts = counts.rename(columns={'behavior': key + '_count'})
    counts.to_csv(str(key) + '_count.csv', index=False)

# Per-entity totals of the time-decayed behavior weight.
for key in ['itemID', 'shop', 'category', 'brand']:
    sums = data[['behavior', key]].groupby(key, as_index=False).agg({'behavior': 'sum'})
    sums = sums.rename(columns={'behavior': key + '_sum'})
    sums.to_csv(str(key) + '_sum.csv', index=False)


def _higher_stats(frame, key):
    # Median / std / skew of the weighted behavior per *key* value.
    stats = frame[['behavior', key]].groupby(key, as_index=False).agg({'behavior': ['median', 'std', 'skew']})
    stats.columns = [key, key + '_median', key + '_std', key + '_skew']
    return stats

temp = _higher_stats(data, 'category')
temp.to_csv('category_higher.csv', index=False)

temp = _higher_stats(data, 'itemID')
temp.to_csv('itemID_higher.csv', index=False)


# Bucket age into decades; keep the first 15 days as the offline window.
data['age'] = data['age'] // 10
train = data[data['day'] < 15]

# Item-level interaction counts split by user demographics. Note this uses
# the full `data` (hence the "_online" suffix in the file name), not `train`.
for demo in ['sex', 'ability', 'age']:
    grouped = data[['behavior', 'itemID', demo]].groupby(['itemID', demo], as_index=False)
    counted = grouped.agg({'behavior': 'count'}).rename(
        columns={'behavior': 'user_to_' + demo + '_count'})
    counted.to_csv('item_to_' + str(demo) + '_count_online.csv', index=False)


# Popularity rank of every item inside its own category.
itemcount = pd.read_csv('itemID_count.csv')

temp = pd.merge(left=item, right=itemcount, how='left', on='itemID')

item_rank = []
for _, cat_frame in temp.groupby('category'):
    ranked = cat_frame.sort_values('itemID_count', ascending=False).reset_index(drop=True)
    ranked['rank'] = ranked.index + 1
    n_items = ranked.shape[0]
    ranked['rank_percent'] = (ranked.index + 1) / n_items
    item_rank.append(ranked[['itemID', 'rank', 'rank_percent']])

item_rank = pd.concat(item_rank, sort=False)

item_rank.to_csv('item_rank.csv', index=False)


def unique_count(x):
    """Number of distinct values in *x* (NaNs count as a value, unlike nunique)."""
    distinct = set(x)
    return len(distinct)

# Category-level cardinalities: distinct items / brands / shops per category.
frames = []
for src_col, out_col in [('itemID', 'itemnum_undercat'),
                         ('brand', 'brandnum_undercat'),
                         ('shop', 'shopnum_undercat')]:
    agg = item.groupby('category', as_index=False).agg({src_col: unique_count})
    frames.append(agg.rename(columns={src_col: out_col}))

cat1, cat2, cat3 = frames

pd.concat([cat1, cat2[['brandnum_undercat']], cat3[['shopnum_undercat']]], axis=1).to_csv('category_lower.csv', index=False)


2、generate_dynamic_feature.ipynb  提取动态特征

import pandas as pd
import numpy as np

def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and downcast each
    numeric column to the smallest dtype that can hold its value range,
    to reduce memory usage. Object columns become ``category``.

    Note: mutates *df* in place and also returns it.
    """
    # NOTE(review): the int/float branch ladder was garbled in the extracted
    # text; reconstructed from the standard Kaggle reduce_mem_usage utility.
    start_mem = df.memory_usage().sum() / 1024 ** 2  # bytes -> MB, so the print below is honest
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def load_data(path):
    """Load the raw user/item/behavior tables under *path* and build the
    base feature frame.

    Returns (user, item, data): the user table, the item table, and the
    behavior log enriched with day/hour, one-hot behavior flags, a
    time-decayed behavior weight, and the merged item/user attributes.
    """
    # NOTE(review): the original `def` line and the read_csv calls were lost
    # in extraction; file names below follow the round-2 dataset layout
    # pointed to by `path` -- confirm against the repository.
    user = pd.read_csv(path + 'user.csv', header=None)
    item = pd.read_csv(path + 'item.csv', header=None)
    data = pd.read_csv(path + 'user_behavior.csv', header=None)

    data.columns = ['userID', 'itemID', 'behavior', 'timestamp']
    data['day'] = data['timestamp'] // 86400        # day index since epoch of the log
    data['hour'] = data['timestamp'] // 3600 % 24   # hour of day

    # One-hot encode the behavior types.
    # NOTE(review): the loop header defining `i` was lost in extraction;
    # restored over the four behavior strings used below.
    for i in ['pv', 'fav', 'cart', 'buy']:
        data[i] = 0
        data.loc[data['behavior'] == i, i] = 1

    # Time-decayed behavior weight: recent actions count more, and stronger
    # actions (cart > fav > pv) get a larger base weight.
    data['day_hour'] = data['day'] + data['hour'] / float(24)
    data.loc[data['behavior'] == 'pv', 'behavior'] = 1
    data.loc[data['behavior'] == 'fav', 'behavior'] = 2
    data.loc[data['behavior'] == 'cart', 'behavior'] = 3
    # NOTE(review): the 'buy' mapping was missing from the snippet; without it
    # the multiplication below would fail on 'buy' rows, so it was evidently
    # lost in extraction -- confirm the weight value against the repository.
    data.loc[data['behavior'] == 'buy', 'behavior'] = 4
    max_day = max(data['day'])
    min_day = min(data['day'])
    data['behavior'] = (1 - (max_day - data['day_hour'] + 2) / (max_day - min_day + 2)) * data['behavior']

    item.columns = ['itemID', 'category', 'shop', 'brand']
    user.columns = ['userID', 'sex', 'age', 'ability']

    data = reduce_mem_usage(data)

    data = pd.merge(left=data, right=item, on='itemID', how='left')
    data = pd.merge(left=data, right=user, on='userID', how='left')

    return user, item, data



#path = '..\\data\\'
# Round-2 training data directory (user / item / behavior tables).
path = '../ECommAI_EUIR_round2_train_20190816/'
user, item, data = load_data(path = path)

# Offline training window: behavior from the first 15 days only.
train = data[data['day'] < 15]

# NOTE(review): `online_features` is never used in the visible code —
# possibly a leftover; kept as-is in case a later cell references it.
online_features = []

# Per-user interaction counts towards category / shop / brand.
for key in ['category', 'shop', 'brand']:
    counts = train[['behavior', 'userID', key]].groupby(['userID', key], as_index=False).agg({'behavior': 'count'})
    counts = counts.rename(columns={'behavior': 'user_to_' + key + '_count'})
    counts.to_csv('user_to_' + str(key) + '_count.csv', index=False)

# Per-user weighted-behavior sums towards category / shop / brand.
for key in ['category', 'shop', 'brand']:
    sums = train[['behavior', 'userID', key]].groupby(['userID', key], as_index=False).agg({'behavior': 'sum'})
    sums = sums.rename(columns={'behavior': 'user_to_' + key + '_sum'})
    sums.to_csv('user_to_' + str(key) + '_sum.csv', index=False)

# Per-user, per-behavior-type counts towards category / shop / brand,
# summing the one-hot columns created in load_data.
# NOTE(review): `behavior_type` was undefined in the snippet — the enclosing
# loop over the one-hot behavior columns was evidently lost in extraction and
# is restored here; confirm the behavior list against the repository.
for behavior_type in ['pv', 'fav', 'cart', 'buy']:
    for count_feature in ['category', 'shop', 'brand']:
        train[[behavior_type, 'userID', count_feature]].groupby(
            ['userID', count_feature], as_index=False).agg(
            {behavior_type: 'sum'}).rename(columns={behavior_type: 'user_to_'
            + count_feature + '_count_' + behavior_type}).to_csv(
            'user_to_' + str(count_feature) + '_count_' + behavior_type + '.csv', index=False)


# "Yesterday" window: day 14 only (the last day before the split).
yestday = data[data['day'] == 14]

# Per-user interaction counts towards category / shop / brand, yesterday only.
for key in ['category', 'shop', 'brand']:
    counts = yestday[['behavior', 'userID', key]].groupby(['userID', key], as_index=False).agg({'behavior': 'count'})
    counts = counts.rename(columns={'behavior': 'user_to_' + key + '_count_yestday'})
    counts.to_csv('user_to_' + str(key) + '_count_yestday.csv', index=False)

# Per-user, per-behavior-type counts (yesterday only).
# NOTE(review): `behavior_type` was undefined in the snippet — the enclosing
# loop over the one-hot behavior columns was evidently lost in extraction and
# is restored here; confirm the behavior list against the repository.
for behavior_type in ['pv', 'fav', 'cart', 'buy']:
    for count_feature in ['category', 'shop', 'brand']:
        yestday[[behavior_type, 'userID', count_feature]].groupby(
            ['userID', count_feature], as_index=False).agg(
            {behavior_type: 'sum'}).rename(columns={behavior_type: 'user_to_'
            + count_feature + '_count_' + behavior_type + '_yestday'}).to_csv(
            'user_to_' + str(count_feature) + '_count_' + behavior_type + '_yestday.csv', index=False)


# Last-five-days window: days 11..14.
a5days = data[(data['day'] > 15 - 5) & (data['day'] < 15)]

# Per-user interaction counts towards category / shop / brand over that window.
for key in ['category', 'shop', 'brand']:
    counts = a5days[['behavior', 'userID', key]].groupby(['userID', key], as_index=False).agg({'behavior': 'count'})
    counts = counts.rename(columns={'behavior': 'user_to_' + key + '_count_5days'})
    counts.to_csv('user_to_' + str(key) + '_count_5days.csv', index=False)

# Per-user, per-behavior-type counts (last five days).
# NOTE(review): `behavior_type` was undefined in the snippet — the enclosing
# loop over the one-hot behavior columns was evidently lost in extraction and
# is restored here; confirm the behavior list against the repository.
for behavior_type in ['pv', 'fav', 'cart', 'buy']:
    for count_feature in ['category', 'shop', 'brand']:
        a5days[[behavior_type, 'userID', count_feature]].groupby(
            ['userID', count_feature], as_index=False).agg(
            {behavior_type: 'sum'}).rename(columns={behavior_type: 'user_to_'
            + count_feature + '_count_' + behavior_type + '_5days'}).to_csv(
            'user_to_' + str(count_feature) + '_count_' + behavior_type + '_5days.csv', index=False)


# Recency features: for each (user, entity) pair, the minimum time-since-last-
# interaction and the latest active day inside the offline window.
start_timestamp = max(data[data['day'] < 15]['timestamp'])

time_features = []
# .copy() avoids pandas chained-assignment issues when adding `last_time`.
test = data[data['day'] < 15].copy()
# NOTE(review): `last_time` was referenced below but never created in the
# snippet, and `start_timestamp` was computed then unused — the deriving line
# was evidently lost in extraction; restored here (it mirrors the same
# computation in generate_time_feature.ipynb).
test['last_time'] = start_timestamp - test['timestamp']
for time_feature in ['shop', 'category', 'brand']:
    time_features.append(
        test[['last_time', 'userID', time_feature, 'day']].groupby(
            ['userID', time_feature], as_index=False).agg(
            {'last_time': 'min', 'day': 'max'}).rename(
            columns={'last_time': 'user_to_' + time_feature + '_lasttime',
                     'day': 'user_to_' + time_feature + '_lastday'}))

for f in time_features:
    f.to_csv(str(f.columns[2]) + '.csv', index=False)

for f in time_features:
    print(str(f.columns[2]) + '.csv')


# Item-level interaction counts split by user demographics (offline window).
for demo in ['sex', 'ability', 'age']:
    counted = train[['behavior', 'itemID', demo]].groupby(['itemID', demo], as_index=False).agg({'behavior': 'count'})
    counted = counted.rename(columns={'behavior': 'user_to_' + demo + '_count'})
    counted.to_csv('item_to_' + str(demo) + '_count.csv', index=False)


3、generate_time_feature.ipynb 提取时间特征

def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and downcast each
    numeric column to the smallest dtype that can hold its value range,
    to reduce memory usage. Object columns become ``category``.

    Note: mutates *df* in place and also returns it.
    """
    # NOTE(review): the int/float branch ladder was garbled in the extracted
    # text; reconstructed from the standard Kaggle reduce_mem_usage utility.
    start_mem = df.memory_usage().sum() / 1024 ** 2  # bytes -> MB, so the print below is honest
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def load_data(path):
    """Load the raw user/item/behavior tables under *path* and build the
    base feature frame.

    Returns (user, item, data): the user table, the item table, and the
    behavior log enriched with day/hour, one-hot behavior flags, a
    time-decayed behavior weight, and the merged item/user attributes.
    """
    # NOTE(review): the original `def` line and the read_csv calls were lost
    # in extraction; file names below follow the round-2 dataset layout
    # pointed to by `path` -- confirm against the repository.
    user = pd.read_csv(path + 'user.csv', header=None)
    item = pd.read_csv(path + 'item.csv', header=None)
    data = pd.read_csv(path + 'user_behavior.csv', header=None)

    data.columns = ['userID', 'itemID', 'behavior', 'timestamp']
    data['day'] = data['timestamp'] // 86400        # day index since epoch of the log
    data['hour'] = data['timestamp'] // 3600 % 24   # hour of day

    # One-hot encode the behavior types.
    # NOTE(review): the loop header defining `i` was lost in extraction;
    # restored over the four behavior strings used below.
    for i in ['pv', 'fav', 'cart', 'buy']:
        data[i] = 0
        data.loc[data['behavior'] == i, i] = 1

    # Time-decayed behavior weight: recent actions count more, and stronger
    # actions (cart > fav > pv) get a larger base weight.
    data['day_hour'] = data['day'] + data['hour'] / float(24)
    data.loc[data['behavior'] == 'pv', 'behavior'] = 1
    data.loc[data['behavior'] == 'fav', 'behavior'] = 2
    data.loc[data['behavior'] == 'cart', 'behavior'] = 3
    # NOTE(review): the 'buy' mapping was missing from the snippet; without it
    # the multiplication below would fail on 'buy' rows, so it was evidently
    # lost in extraction -- confirm the weight value against the repository.
    data.loc[data['behavior'] == 'buy', 'behavior'] = 4
    max_day = max(data['day'])
    min_day = min(data['day'])
    data['behavior'] = (1 - (max_day - data['day_hour'] + 2) / (max_day - min_day + 2)) * data['behavior']

    item.columns = ['itemID', 'category', 'shop', 'brand']
    user.columns = ['userID', 'sex', 'age', 'ability']

    data = reduce_mem_usage(data)

    data = pd.merge(left=data, right=item, on='itemID', how='left')
    data = pd.merge(left=data, right=user, on='userID', how='left')

    return user, item, data



path = '../ECommAI_EUIR_round2_train_20190816/'
user, item, data = load_data(path=path)

# Offline window: first 15 days. .copy() avoids pandas chained-assignment
# issues (SettingWithCopyWarning / lost writes under copy-on-write) when
# adding `last_time` to the slice below.
train = data[data['day'] < 15].copy()

# Seconds elapsed since each record, measured from the end of the window.
start_timestamp = max(train['timestamp'])

train['last_time'] = start_timestamp - train['timestamp']

timefeatures = []

# Per-entity recency: minimum `last_time` in the window, plus the hour-of-day
# component of that recency value.
for time_feature in ['itemID', 'shop', 'category', 'brand']:
    name = time_feature + '_last_time_underline.csv'
    tf = train[['last_time', time_feature]].groupby(
        time_feature, as_index=False).agg({'last_time': 'min'}).rename(
        columns={'last_time': time_feature + 'last_time'})
    tf[time_feature + 'last_time_hour_ed'] = tf[time_feature + 'last_time'] // 3600 % 24
    timefeatures.append((name, tf))

for f in timefeatures:
    f[1].to_csv(f[0], index=False)