写这篇文章的原因主要是:最近在看一本书《Python爬虫开发与实战-从入门到实战》,里面提到了 csv 这个模块。我立马进行了尝试,发现非常好用,比之前用的 xlwt 好用多了。
关键是爬取到数据之后,整个存储数据的逻辑更容易理解(可能还是自己太菜吧?)。本文介绍如何通过 pandas 和 csv 模块对数据进行读写操作。
# 1 - Writing with pandas
import pandas as pd

# Four sample records; every dict has the same keys, which become the columns.
data = [
    {"name": "yangming", "age": 32, "height": 180, "address": "shenzhen"},
    {"name": "xiaoming", "age": 24, "height": 168, "address": "guangzhou"},
    {"name": "zhoujun", "age": 29, "height": 184, "address": "shanghai"},
    {"name": "zhangshan", "age": 20, "height": 170, "address": "changsha"},
]
df = pd.DataFrame(data)  # one row per dict
df  # bare expression: shows the frame when run as a notebook cell
| | name | age | height | address |
---|---|---|---|---|
0 | yangming | 32 | 180 | shenzhen |
1 | xiaoming | 24 | 168 | guangzhou |
2 | zhoujun | 29 | 184 | shanghai |
3 | zhangshan | 20 | 170 | changsha |
# Save the DataFrame as CSV. `index` controls whether the row labels are
# written to the file; it defaults to True.
with_index_path = "tocsvfile-pandas.csv"
df.to_csv(with_index_path, sep=",")
pd.read_csv(with_index_path)
| | Unnamed: 0 | name | age | height | address |
---|---|---|---|---|---|
0 | 0 | yangming | 32 | 180 | shenzhen |
1 | 1 | xiaoming | 24 | 168 | guangzhou |
2 | 2 | zhoujun | 29 | 184 | shanghai |
3 | 3 | zhangshan | 20 | 170 | changsha |
# How to read the `index` parameter: with index=False the row labels are not
# written, so reading the file back yields no extra "Unnamed: 0" column.
no_index_path = "tocsvfile-pandas-1.csv"
df.to_csv(no_index_path, sep=",", index=False)
pd.read_csv(no_index_path)
| | name | age | height | address |
---|---|---|---|---|
0 | yangming | 32 | 180 | shenzhen |
1 | xiaoming | 24 | 168 | guangzhou |
2 | zhoujun | 29 | 184 | shanghai |
3 | zhangshan | 20 | 170 | changsha |
### Storing data with the csv module from a list of dicts
import csv

data = [
    {"name": "yangming", "age": 32, "height": 180, "address": "shenzhen"},
    {"name": "xiaoming", "age": 24, "height": 168, "address": "guangzhou"},
    {"name": "zhoujun", "age": 29, "height": 184, "address": "shanghai"},
    {"name": "zhangshan", "age": 20, "height": 170, "address": "changsha"},
]

# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" on write end up with blank rows in the file.
with open("information.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["name", "age", "height", "address"])
    writer.writeheader()
    writer.writerows(data)  # write the whole list at once -> writerows
    # write one additional record -> writerow
    writer.writerow({"name": "Peter", "age": 28, "height": 176, "address": "shenzhen"})

# Read the file back once the with-block has closed (and flushed) it.
import pandas as pd

data = pd.read_csv("information.csv")
data
| | name | age | height | address |
---|---|---|---|---|
0 | yangming | 32 | 180 | shenzhen |
1 | xiaoming | 24 | 168 | guangzhou |
2 | zhoujun | 29 | 184 | shanghai |
3 | zhangshan | 20 | 170 | changsha |
4 | Peter | 28 | 176 | shenzhen |
# Open the file once, before the loop, so the header row is not written
# into the file over and over.
# Example: a tour of the Jianshu app
with open("information-1.csv", "a", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["name", "age", "height", "address"])
    # Append mode starts at the end of the file, so position 0 means the file
    # is new or empty. Only then is a header needed; re-running this cell no
    # longer inserts a second header line in the middle of the data.
    if f.tell() == 0:
        writer.writeheader()
    for i in range(1, 5):
        name_list = ["xiaoming", "yanghong", "peter", "Tom"] * i
        age_list = [19, 27, 32, 24] * i
        height_list = [176, 180, 172, 183] * i
        address_list = ["shenzhen", "guangzhou", "shanghai", "changsha"] * i
        # Pair the four parallel lists up into one dict per record.
        infomation_list = [
            {"name": name, "age": age, "height": height, "address": address}
            for name, age, height, address in zip(
                name_list, age_list, height_list, address_list
            )
        ]
    # NOTE(review): the original indentation was lost. Writing once, after the
    # loop, is the placement that yields the 16 data rows shown in the output
    # below (the list built in the last pass, i == 4) -- confirm this is intended.
    writer.writerows(infomation_list)

# Read the data back
import pandas as pd

data = pd.read_csv("information-1.csv")
data
| | name | age | height | address |
---|---|---|---|---|
0 | xiaoming | 19 | 176 | shenzhen |
1 | yanghong | 27 | 180 | guangzhou |
2 | peter | 32 | 172 | shanghai |
3 | Tom | 24 | 183 | changsha |
4 | xiaoming | 19 | 176 | shenzhen |
5 | yanghong | 27 | 180 | guangzhou |
6 | peter | 32 | 172 | shanghai |
7 | Tom | 24 | 183 | changsha |
8 | xiaoming | 19 | 176 | shenzhen |
9 | yanghong | 27 | 180 | guangzhou |
10 | peter | 32 | 172 | shanghai |
11 | Tom | 24 | 183 | changsha |
12 | xiaoming | 19 | 176 | shenzhen |
13 | yanghong | 27 | 180 | guangzhou |
14 | peter | 32 | 172 | shanghai |
15 | Tom | 24 | 183 | changsha |
import csv

# 1 - header row
fileHeader = ["name", "score"]

# 2 - the three data rows to write
d1 = ["Wang", "100"]
d2 = ["Li", "80"]
d3 = ["xiaosi", "92"]

# 3 - write the data. The with-block closes the file even if a write raises,
# and newline="" leaves line-ending handling to the csv module.
with open("instance_1.csv", "w", newline="") as f:
    writer = csv.writer(f)  # build the writer object
    # Each row is passed to the writer as one list.
    # writer.writerows([fileHeader, d1, d2, d3]) would be equivalent to the
    # four writerow calls below.
    # 4 - pass the rows one at a time
    writer.writerow(fileHeader)
    writer.writerow(d1)
    writer.writerow(d2)
    writer.writerow(d3)

# 5 - the file is already closed here; no explicit f.close() is needed
pd.read_csv("instance_1.csv")
| | name | score |
---|---|---|
0 | Wang | 100 |
1 | Li | 80 |
2 | xiaosi | 92 |
import csv

# header row
fileHeader = ["name", "score"]

# the three data rows to write
d1 = ["Wang", "100"]
d2 = ["Li", "80"]
d3 = ["xiaosi", "92"]

# Write the data. newline="" leaves line-ending handling to the csv module.
with open("instance_2.csv", "a", newline="") as f:
    writer = csv.writer(f)
    # Append mode starts at the end of the file, so position 0 means the file
    # is new or empty. Writing the header only then keeps a re-run from
    # appending a second header line as if it were data.
    if f.tell() == 0:
        writer.writerow(fileHeader)
    writer.writerows([d1, d2, d3])

pd.read_csv("instance_2.csv")
| | name | score |
---|---|---|
0 | Wang | 100 |
1 | Li | 80 |
2 | xiaosi | 92 |
# 1 - Read the file with pandas
import pandas as pd

csvfile = pd.read_csv(filepath_or_buffer="information.csv")
csvfile  # bare expression: shows the frame when run as a notebook cell
| | name | age | height | address |
---|---|---|---|---|
0 | yangming | 32 | 180 | shenzhen |
1 | xiaoming | 24 | 168 | guangzhou |
2 | zhoujun | 29 | 184 | shanghai |
3 | zhangshan | 20 | 170 | changsha |
4 | Peter | 28 | 176 | shenzhen |
# 2 - Read the file with the csv module
import csv

# newline="" lets the csv module handle line endings itself; the encoding
# matches the one used when information-1.csv was written.
with open("information-1.csv", newline="", encoding="utf-8") as f:
    csvfile = csv.reader(f)
    for line in csvfile:  # iterate the reader directly; no readlines() needed
        print(line)  # each row is a list of strings
['name', 'age', 'height', 'address']
['xiaoming', '19', '176', 'shenzhen']
['yanghong', '27', '180', 'guangzhou']
['peter', '32', '172', 'shanghai']
['Tom', '24', '183', 'changsha']
['xiaoming', '19', '176', 'shenzhen']
['yanghong', '27', '180', 'guangzhou']
['peter', '32', '172', 'shanghai']
['Tom', '24', '183', 'changsha']
['xiaoming', '19', '176', 'shenzhen']
['yanghong', '27', '180', 'guangzhou']
['peter', '32', '172', 'shanghai']
['Tom', '24', '183', 'changsha']
['xiaoming', '19', '176', 'shenzhen']
['yanghong', '27', '180', 'guangzhou']
['peter', '32', '172', 'shanghai']
['Tom', '24', '183', 'changsha']