import pandas as pd
# Load the bus-route table from the CSV file.
bus_info = pd.read_csv('./Processed_Beijing_Bus_Info.csv')
# Discard duplicated rows, then renumber the index from zero.
bus_info1 = bus_info.drop_duplicates()
bus_info1 = bus_info1.reset_index(drop=True)
# Discard rows that contain missing values, then renumber the index again.
bus_info2 = bus_info1.dropna()
bus_info2 = bus_info2.reset_index(drop=True)
# Clean the "total mileage" (licheng) column.
def clean_distance(distance):
    """Reduce one raw 'licheng' cell to a single sentence or a placeholder.

    The text before the first '|' is split into sentences on '。' and one
    sentence is selected according to the keywords the text contains:

    * contains '咨询' or '间隔'  -> the first sentence, returned as is;
    * contains '本线路', '南沟村' or '定点班车' -> the placeholder;
    * contains '线路' and '工作日' -> the second sentence of the text that
      precedes '工作日';
    * contains '线路' and '52' -> the second sentence;
    * anything else -> the first sentence, but only if it contains '全程'.

    Args:
        distance: The raw cell value. Values that are not ``str`` (for
            example ``None`` or NaN) are treated as having no usable text.

    Returns:
        The selected sentence, or the placeholder '没有标识' when no sentence
        qualifies or the expected sentence does not exist.
    """
    placeholder = '没有标识'

    # Membership tests and split() below only make sense for text.
    if not isinstance(distance, str):
        return placeholder

    # Only the part before the first '|' is considered; split() returns the
    # whole string when no '|' is present, so no prior check is needed.
    distance = distance.split('|')[0]
    sentences = distance.split('。')
    first_sentence = sentences[0]

    # '咨询' also matches '线路咨询' and '间隔' also matches '低峰间隔', so a
    # single substring test per keyword is sufficient.
    if '咨询' in distance or '间隔' in distance:
        return first_sentence

    if any(marker in distance for marker in ('本线路', '南沟村', '定点班车')):
        return placeholder

    if '线路' in distance:
        if '工作日' in distance:
            before_weekday = distance.split('工作日')[0].split('。')
            # Guard against there being no second sentence before '工作日'.
            return before_weekday[1] if len(before_weekday) > 1 else placeholder
        if '52' in distance:
            # Guard against the text consisting of a single sentence.
            return sentences[1] if len(sentences) > 1 else placeholder

    # Default rule: keep the first sentence only when it carries '全程'.
    return first_sentence if '全程' in first_sentence else placeholder
# Run clean_distance over every value of the 'licheng' column and store the
# result back into the same column.
bus_info2['licheng'] = bus_info2['licheng'].apply(func=clean_distance)
# Print a heading followed by the first five cleaned values.
print("\n清洗后的数据前五行:", bus_info2['licheng'].head(5), sep="\n")
# 原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
# 如有侵权,请联系 cloudcommunity@tencent.com 删除。
# 原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
# 如有侵权,请联系 cloudcommunity@tencent.com 删除。