集团客户图谱:以下图为例进行说明,如何获取企业A对企业D的控股比例呢?
采用图技术来计算,获得间接持股比例。
Demo 数据采用 Python 的 faker 库进行构造,主要生成关系数据和目标客户数据。
# Generate demo shareholding-ratio (investment relationship) data.
def demo_data_(edge_num):
    """Build a DataFrame of randomly generated investment relationships.

    Each record holds the investing company, the invested company, the
    holding ratio and the investment date. Company names and dates come
    from the module-level ``fake`` generator; the ratio is
    ``random.random()``.

    Args:
        edge_num: Number of demo relationship records to generate.

    Returns:
        A DataFrame with columns ``start_company``, ``end_company``,
        ``weight`` and ``data_date``.
    """
    # Record layout: investor, investee, holding ratio, investment date.
    records = [
        [
            fake.company(),
            fake.company(),
            random.random(),
            fake.date(pattern="%Y-%m-%d", end_datetime=None),
        ]
        for _ in range(edge_num)
    ]
    demo_data = pd.DataFrame(
        records,
        columns=['start_company', 'end_company', 'weight', 'data_date'],
    )
    print("-----demo_data describe-----")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    demo_data.info()
    print("-----demo_data head---------")
    print(demo_data.head())
    return demo_data
# Generate node (target customer) data.
def node_data_(node_num):
    """Create ``node_num`` fake company names and save the distinct ones.

    The de-duplicated names are stored in a single ``cust_id`` column, the
    number of distinct nodes is printed, and the table is written to
    ``node_data.csv`` without the index. Nothing is returned.

    Args:
        node_num: Number of company names to draw from ``fake``.
    """
    names = []
    for _ in range(node_num):
        names.append(fake.company())

    node_data = pd.DataFrame(names, columns=['cust_id'])
    node_data = node_data.drop_duplicates()

    distinct_ids = node_data['cust_id'].unique()
    print('节点数目', len(distinct_ids))

    node_data.to_csv('node_data.csv', index=False)
数据处理(使用了多进程 multiprocessing)是一个经验与技术活,数据处理的好坏直接影响模型的结果,这里介绍以下几种方法:
# Clean the demo relationship data.
def rela_data_(demo_data):
    """Clean raw investment relationships and normalise over-allocated stakes.

    Steps:
      1. Drop self-investments (rows for which ``if_same`` returns 1).
      2. Drop rows whose start or end company is the empty string.
      3. Keep only the newest record per (start_company, end_company) pair.
      4. De-duplicate the records whose weight is above 0.5.
      5. For every invested company whose incoming weights sum to more than
         1, rescale those weights in a process pool (see ``do_something``)
         so that they sum to 1.

    Side effects:
      * Sets the module-level ``demo_data_init`` read by ``do_something``.
      * Recreates ``exchange.csv`` (worker scratch file) and writes
        ``demo_data_final.csv``.

    Args:
        demo_data: DataFrame with columns ``start_company``, ``end_company``,
            ``weight`` and ``data_date``. It is not modified.

    Returns:
        The cleaned DataFrame with the same four columns.
    """
    import os  # local import: only needed to reset the worker scratch file

    # Snapshot used by the worker processes for normalisation.
    global demo_data_init

    columns = ['start_company', 'end_company', 'weight', 'data_date']
    scratch_path = 'exchange.csv'

    print('原始数据记录数', len(demo_data))
    # Drop self-investments. The flag is kept in a local Series so the
    # caller's DataFrame does not silently gain a helper column.
    same_flag = demo_data.apply(
        lambda x: if_same(x['start_company'], x['end_company']), axis=1
    )
    demo_data = demo_data.loc[same_flag != 1]
    # Drop rows with an empty company name on either side.
    demo_data = demo_data[
        (demo_data['start_company'] != '') & (demo_data['end_company'] != '')
    ]
    # Sort descending by date and keep the newest row per company pair.
    demo_data = (
        demo_data
        .sort_values(by=['start_company', 'end_company', 'data_date'], ascending=False)
        .drop_duplicates(keep='first', subset=['start_company', 'end_company'])
        .reset_index(drop=True)
    )
    # De-duplicate the records above 0.5, newest first.
    # NOTE(review): the original comment says "keep only the latest >0.5
    # record", which suggests subset=['end_company']; the original subset
    # (end_company + weight) is preserved here - confirm the intent.
    minority = demo_data.loc[demo_data['weight'] <= 0.5]
    majority = (
        demo_data.loc[demo_data['weight'] > 0.5]
        .sort_values(by=['end_company', 'data_date'], ascending=False)
        .drop_duplicates(keep='first', subset=['end_company', 'weight'])
    )
    demo_data = pd.concat([minority, majority]).reset_index(drop=True)[columns]

    demo_data_init = demo_data.copy()

    # Total incoming holding ratio per invested company.
    demo_data_sum = demo_data[['end_company', 'weight']].groupby(['end_company']).sum()
    # Invested companies whose total ratio exceeds 1.
    more_one_index = demo_data_sum.loc[demo_data_sum['weight'] > 1].index.unique()
    print('持股比例大于1的index', len(more_one_index))

    # Workers append to the scratch file, so rows left over from a previous
    # run must be removed first or they would be merged into this result.
    if os.path.exists(scratch_path):
        os.remove(scratch_path)

    # Normalise the over-allocated companies in parallel.
    # Runs on Linux; reported to fail on Windows (workers rely on the
    # inherited ``demo_data_init`` global).
    start = timeit.default_timer()
    if len(more_one_index) > 0:
        p = multiprocessing.Pool(32)
        try:
            p.map(do_something, more_one_index[:])
        finally:
            p.close()
            p.join()
    end = timeit.default_timer()
    print('multi processing time:', str(end-start), 's')

    # Companies whose total ratio does not exceed 1 are kept unchanged.
    low_one_index = demo_data_sum.loc[demo_data_sum['weight'] <= 1].index
    base_low_one = pd.merge(
        demo_data, pd.DataFrame(low_one_index), on=['end_company'], how='inner'
    )
    frames = [base_low_one]

    # Normalised rows written by the workers. When nothing needed
    # normalising the scratch file does not exist, so reading is skipped.
    if len(more_one_index) > 0:
        base_more_one = pd.read_csv(scratch_path, header=None)
        base_more_one.columns = columns
        frames.append(base_more_one)

    demo_data_final = pd.concat(frames).reset_index(drop=True)[columns].drop_duplicates()
    print('数据处理后记录数', len(demo_data_final))
    demo_data_final.to_csv('demo_data_final.csv', index=False)
    return demo_data_final
# Worker function for the parallel normalisation step.
def do_something(i):
    """Rescale the holding ratios of one invested company so they sum to 1.

    Selects the rows of the module-level ``demo_data_init`` whose
    ``end_company`` equals ``i``, divides each weight by the total weight
    of those rows, and appends the result (no header, no index) to
    ``exchange.csv``.

    Args:
        i: The ``end_company`` value to normalise.
    """
    # Rows of the company whose total ratio exceeds 1, newest first.
    is_target = demo_data_init['end_company'] == i
    holders = demo_data_init.loc[is_target]
    holders = holders.sort_values(by=['end_company', 'data_date'], ascending=False)

    # Normalised funded ratio.
    total_weight = sum(holders['weight'])
    holders = holders.assign(weight=holders['weight'] / total_weight)

    holders.to_csv(
        'exchange.csv',
        encoding='utf-8',
        index=False,
        header=0,
        mode='a',
    )
    print('-----End of The',i,'-----')
使用 Python 的 networkx 构建带边权重的有向图。
# Build the directed graph.
def graph_(rela_data):
    """Create a weighted ``nx.DiGraph`` from a relationship table.

    Every row of ``rela_data`` is converted to a tuple and added as one
    weighted edge, in index order.

    Args:
        rela_data: DataFrame whose rows are (source, target, weight)
            triples; the index labels are used for row lookup.

    Returns:
        The populated directed graph.
    """
    digraph = nx.DiGraph()
    weighted_edges = [
        tuple(rela_data.loc[label].values) for label in rela_data.index
    ]
    digraph.add_weighted_edges_from(weighted_edges)
    return digraph
# Build the shared graph from the distinct (start_company, end_company, weight)
# rows of `rela_data` and report its size.
# NOTE(review): `rela_data` is not defined in the visible snippet - presumably
# the result of rela_data_(); confirm against the full script.
# NOTE(review): `global` has no effect at module scope; this snippet may have
# been lifted from inside a function - confirm.
global Graph
Graph = graph_(rela_data[['start_company', 'end_company', 'weight']].drop_duplicates())
# Printed labels: number of nodes / number of relationships (edges) in the graph.
print('图中节点数目', Graph.number_of_nodes())
print('图中关系数目', Graph.number_of_edges())
# Compute the (indirect) shareholding-ratio matrix.
def sum_involution(ma, n_step, C=1):
    """Sum the direct holding matrix and its higher-order products.

    Starting from ``ma``, the running product is repeatedly multiplied by
    the original matrix, rounded to 6 decimals and has its diagonal zeroed
    before being added (scaled by ``C``) to the result. The first-order
    term ``ma`` itself is added unchanged.

    Args:
        ma: Square pandas DataFrame of direct holding ratios whose index
            and columns carry the same labels. It is not modified.
        n_step: Highest order to include. Values of 1 or less return ``ma``
            itself.
        C: Decay coefficient multiplied into every higher-order term.
            Defaults to 1 (no decay), the value that was previously
            hard-coded.

    Returns:
        A DataFrame with the same labels as ``ma`` holding the accumulated
        ratios.
    """
    base = ma
    power = ma
    result = ma
    for _ in range(n_step - 1):
        power = power.dot(base).round(6)
        # Zero the diagonal on a new frame rather than writing through
        # ``.values``: that array is read-only under pandas Copy-on-Write
        # and is not guaranteed to be a view of the frame.
        on_diagonal = np.eye(*power.shape, dtype=bool)
        power = power.mask(on_diagonal, 0)
        result = result + C * power
    return result
七. 展望:
发现隐性关系,后续应用于集团划分,可采用louvain;
采用louvain时,如果有线下验证的集团标签,可以做监督学习,将衰减参数 C 作为待学习的参数。
代码链接:
https://github.com/MO2T/1.Recognition_of_implicit_relationship