import pandas as pd
import numpy as np
import osos.getcwd()'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据表处理'os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')df1 = pd.read_csv('sam_tianchi_mum_baby.csv', encoding='utf-8', dtype=str)df1.head(5)<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>birthday</th> <th>gender</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>2757</td> <td>20130311</td> <td>1</td></tr><tr> <th>1</th> <td>415971</td> <td>20121111</td> <td>0</td></tr><tr> <th>2</th> <td>1372572</td> <td>20120130</td> <td>1</td></tr><tr> <th>3</th> <td>10339332</td> <td>20110910</td> <td>0</td></tr><tr> <th>4</th> <td>10642245</td> <td>20130213</td> <td>0</td></tr></tbody>
</table>
</div>
df1.info()<class 'pandas.core.frame.DataFrame'>RangeIndex: 953 entries, 0 to 952Data columns (total 3 columns):user_id 953 non-null objectbirthday 953 non-null objectgender 953 non-null objectdtypes: object(3)memory usage: 22.4+ KB# 修改0为女性
df1.loc[df1['gender']=='0', 'gender'] = '女性'df1.loc[df1['gender']=='1', 'gender'] = '男性'
df1.loc[df1['gender']=='2', 'gender'] = '未知'df1.head(5)<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>birthday</th> <th>gender</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>2757</td> <td>20130311</td> <td>男性</td></tr><tr> <th>1</th> <td>415971</td> <td>20121111</td> <td>女性</td></tr><tr> <th>2</th> <td>1372572</td> <td>20120130</td> <td>男性</td></tr><tr> <th>3</th> <td>10339332</td> <td>20110910</td> <td>女性</td></tr><tr> <th>4</th> <td>10642245</td> <td>20130213</td> <td>女性</td></tr></tbody>
</table>
</div>
# 对列标签进行修改,传入参数为字典形式
# 加 inplace=True,会对原数据进行修改
df1.rename(columns={'user_id': '用户ID', 'birthday': '出生日期', 'gender': '性别'}, inplace=True)df1.head(5)<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>用户ID</th> <th>出生日期</th> <th>性别</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>2757</td> <td>20130311</td> <td>男性</td></tr><tr> <th>1</th> <td>415971</td> <td>20121111</td> <td>女性</td></tr><tr> <th>2</th> <td>1372572</td> <td>20120130</td> <td>男性</td></tr><tr> <th>3</th> <td>10339332</td> <td>20110910</td> <td>女性</td></tr><tr> <th>4</th> <td>10642245</td> <td>20130213</td> <td>女性</td></tr></tbody>
</table>
</div>
# 修改行标签
df1.rename(index={3: 333, 4:444}, inplace=True)df1.head(5)<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>用户ID</th> <th>出生日期</th> <th>性别</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>2757</td> <td>20130311</td> <td>男性</td></tr><tr> <th>1</th> <td>415971</td> <td>20121111</td> <td>女性</td></tr><tr> <th>2</th> <td>1372572</td> <td>20120130</td> <td>男性</td></tr><tr> <th>333</th> <td>10339332</td> <td>20110910</td> <td>女性</td></tr><tr> <th>444</th> <td>10642245</td> <td>20130213</td> <td>女性</td></tr></tbody>
</table>
</div>
# iloc按照位置取值,与标签无关
df1.iloc[:5]<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>用户ID</th> <th>出生日期</th> <th>性别</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>2757</td> <td>20130311</td> <td>男性</td></tr><tr> <th>1</th> <td>415971</td> <td>20121111</td> <td>女性</td></tr><tr> <th>2</th> <td>1372572</td> <td>20120130</td> <td>男性</td></tr><tr> <th>333</th> <td>10339332</td> <td>20110910</td> <td>女性</td></tr><tr> <th>444</th> <td>10642245</td> <td>20130213</td> <td>女性</td></tr></tbody>
</table>
</div>
# 重置索引
df1.reset_index(drop=True, inplace=True)df1.head(5)<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>用户ID</th> <th>出生日期</th> <th>性别</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>2757</td> <td>20130311</td> <td>男性</td></tr><tr> <th>1</th> <td>415971</td> <td>20121111</td> <td>女性</td></tr><tr> <th>2</th> <td>1372572</td> <td>20120130</td> <td>男性</td></tr><tr> <th>3</th> <td>10339332</td> <td>20110910</td> <td>女性</td></tr><tr> <th>4</th> <td>10642245</td> <td>20130213</td> <td>女性</td></tr></tbody>
</table>
</div>
df = pd.read_csv('baby_trade_history.csv', encoding='utf-8', dtype={'user_id':str})df.head(2)<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>auction_id</th> <th>cat_id</th> <th>cat1</th> <th>property</th> <th>buy_mount</th> <th>day</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>786295544</td> <td>41098319944</td> <td>50014866</td> <td>50022520</td> <td>21458:86755362;13023209:3593274;10984217:21985...</td> <td>2</td> <td>20140919</td></tr><tr> <th>1</th> <td>532110457</td> <td>17916191097</td> <td>50011993</td> <td>28</td> <td>21458:11399317;1628862:3251296;21475:137325;16...</td> <td>1</td> <td>20131011</td></tr></tbody>
</table>
</div>
# 选择buy_mount 大于10 的数据
# 返回的是布尔索引值
# 放入pandas dataframe 中,会自动筛选
# 数据太多,使用分片打印前5个
df[df['buy_mount'] > 10][:5]<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>auction_id</th> <th>cat_id</th> <th>cat1</th> <th>property</th> <th>buy_mount</th> <th>day</th></tr></thead>
<tbody>
<tr> <th>49</th> <td>103125167</td> <td>18426669796</td> <td>50018438</td> <td>50014815</td> <td>21458:46896;1628665:3233941;1628665:3233942;21...</td> <td>16</td> <td>20140220</td></tr><tr> <th>65</th> <td>605724983</td> <td>19747694834</td> <td>50006520</td> <td>50014815</td> <td>21458:30992</td> <td>12</td> <td>20141017</td></tr><tr> <th>89</th> <td>277279277</td> <td>18024521052</td> <td>211122</td> <td>38</td> <td>21458:33516;33480:3238774;2653417:7353464;3359...</td> <td>12</td> <td>20130513</td></tr><tr> <th>247</th> <td>392530596</td> <td>17001611735</td> <td>50011993</td> <td>28</td> <td>122218042:50276;21475:135183931;1628861:45151;...</td> <td>101</td> <td>20140301</td></tr><tr> <th>409</th> <td>1968453717</td> <td>12567034563</td> <td>122616024</td> <td>28</td> <td>135925585:42825;138052423:142000990;18822961:2...</td> <td>14</td> <td>20150127</td></tr></tbody>
</table>
</div>
# 选择buy_mount 小于等于10 的数据
df[~(df['buy_mount'] > 10)][:5]<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>auction_id</th> <th>cat_id</th> <th>cat1</th> <th>property</th> <th>buy_mount</th> <th>day</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>786295544</td> <td>41098319944</td> <td>50014866</td> <td>50022520</td> <td>21458:86755362;13023209:3593274;10984217:21985...</td> <td>2</td> <td>20140919</td></tr><tr> <th>1</th> <td>532110457</td> <td>17916191097</td> <td>50011993</td> <td>28</td> <td>21458:11399317;1628862:3251296;21475:137325;16...</td> <td>1</td> <td>20131011</td></tr><tr> <th>2</th> <td>249013725</td> <td>21896936223</td> <td>50012461</td> <td>50014815</td> <td>21458:30992;1628665:92012;1628665:3233938;1628...</td> <td>1</td> <td>20131011</td></tr><tr> <th>3</th> <td>917056007</td> <td>12515996043</td> <td>50018831</td> <td>50014815</td> <td>21458:15841995;21956:3494076;27000458:59723383...</td> <td>2</td> <td>20141023</td></tr><tr> <th>4</th> <td>444069173</td> <td>20487688075</td> <td>50013636</td> <td>50008168</td> <td>21458:30992;13658074:3323064;1628665:3233941;1...</td> <td>1</td> <td>20141103</td></tr></tbody>
</table>
</div>
# 多条件查询
# 每个条件用括号括起来
df[(df['buy_mount'] > 10) & (df['day'] > 20140101)][:5]<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>auction_id</th> <th>cat_id</th> <th>cat1</th> <th>property</th> <th>buy_mount</th> <th>day</th></tr></thead>
<tbody>
<tr> <th>49</th> <td>103125167</td> <td>18426669796</td> <td>50018438</td> <td>50014815</td> <td>21458:46896;1628665:3233941;1628665:3233942;21...</td> <td>16</td> <td>20140220</td></tr><tr> <th>65</th> <td>605724983</td> <td>19747694834</td> <td>50006520</td> <td>50014815</td> <td>21458:30992</td> <td>12</td> <td>20141017</td></tr><tr> <th>247</th> <td>392530596</td> <td>17001611735</td> <td>50011993</td> <td>28</td> <td>122218042:50276;21475:135183931;1628861:45151;...</td> <td>101</td> <td>20140301</td></tr><tr> <th>409</th> <td>1968453717</td> <td>12567034563</td> <td>122616024</td> <td>28</td> <td>135925585:42825;138052423:142000990;18822961:2...</td> <td>14</td> <td>20150127</td></tr><tr> <th>462</th> <td>1802549062</td> <td>17383345857</td> <td>50006520</td> <td>50014815</td> <td>22277:6262384;21458:30992;1628665:61550;162866...</td> <td>11</td> <td>20141120</td></tr></tbody>
</table>
</div>
# between方法不适用于字符串数据
df[df['buy_mount'].between(4, 10, inclusive=True)][:5]<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>auction_id</th> <th>cat_id</th> <th>cat1</th> <th>property</th> <th>buy_mount</th> <th>day</th></tr></thead>
<tbody>
<tr> <th>22</th> <td>469517728</td> <td>8232924597</td> <td>211122</td> <td>38</td> <td>21458:21782;36786:42781029;13023102:6999219;22...</td> <td>6</td> <td>20140502</td></tr><tr> <th>52</th> <td>55544814</td> <td>4917672059</td> <td>50015727</td> <td>50014815</td> <td>21458:4540492;1633959:58840623;7107736:3227806...</td> <td>4</td> <td>20131106</td></tr><tr> <th>117</th> <td>296448405</td> <td>18524578446</td> <td>50016030</td> <td>50008168</td> <td>21458:247918101;1628665:29782;1628665:29784;16...</td> <td>7</td> <td>20131202</td></tr><tr> <th>134</th> <td>97481514</td> <td>41161316434</td> <td>211122</td> <td>38</td> <td>6940834:29865;21458:4331527;1804977:606613769;...</td> <td>6</td> <td>20141126</td></tr><tr> <th>148</th> <td>662134541</td> <td>7594318922</td> <td>211122</td> <td>38</td> <td>21458:21776;36780:15333590;2675455:19653564;69...</td> <td>6</td> <td>20131211</td></tr></tbody>
</table>
</div>
df.dtypesuser_id objectauction_id int64cat_id int64cat1 int64property objectbuy_mount int64day int64dtype: object# 多个值放入列表中
df[df['cat1'].isin([38,28])][:5]<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>auction_id</th> <th>cat_id</th> <th>cat1</th> <th>property</th> <th>buy_mount</th> <th>day</th></tr></thead>
<tbody>
<tr> <th>1</th> <td>532110457</td> <td>17916191097</td> <td>50011993</td> <td>28</td> <td>21458:11399317;1628862:3251296;21475:137325;16...</td> <td>1</td> <td>20131011</td></tr><tr> <th>8</th> <td>82830661</td> <td>19948600790</td> <td>50013874</td> <td>28</td> <td>21458:11580;21475:137325</td> <td>1</td> <td>20121101</td></tr><tr> <th>9</th> <td>475046636</td> <td>10368360710</td> <td>203527</td> <td>28</td> <td>22724:40168;22729:40278;21458:21817;2770200:24...</td> <td>1</td> <td>20121101</td></tr><tr> <th>10</th> <td>734147966</td> <td>15307958346</td> <td>50018202</td> <td>38</td> <td>21458:3270827;7361532:28710594;7397093:7536994...</td> <td>2</td> <td>20121101</td></tr><tr> <th>13</th> <td>377550424</td> <td>15771663914</td> <td>50015841</td> <td>28</td> <td>1628665:3233941;1628665:3233942;3914866:11580;...</td> <td>1</td> <td>20121123</td></tr></tbody>
</table>
</div>
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。