The following steps show how to use Python to read every HTML file in a directory and write the extracted content to a CSV file:
import os
import csv
from bs4 import BeautifulSoup

def read_html_file(file_path):
    # Read one HTML file and parse it with BeautifulSoup
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    # Extract whatever you need here, depending on the HTML structure.
    # Example: extract the page title and the body text.
    title = soup.find('title').text
    body = soup.find('body').text
    return title, body
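Note that soup.find() returns None when a page has no <title> or <body> tag, so the two .text accesses above will raise an AttributeError on such files. If your directory may contain pages like that, a more defensive variant of the same function (a sketch, not part of the original answer) could look like this:

def read_html_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')
    # Fall back to an empty string when a tag is missing
    title_tag = soup.find('title')
    body_tag = soup.find('body')
    title = title_tag.get_text(strip=True) if title_tag else ''
    body = body_tag.get_text(separator=' ', strip=True) if body_tag else ''
    return title, body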
def process_html_files(directory):
    html_files = [f for f in os.listdir(directory) if f.endswith('.html')]
    data = []
    for file in html_files:
        file_path = os.path.join(directory, file)
        title, body = read_html_file(file_path)
        data.append([title, body])
    return data
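os.listdir only looks at the top level of the directory, and the .endswith('.html') check skips files named with the .htm extension. If you also need to pick up HTML files in subdirectories, a pathlib-based sketch (an alternative to the function above, assuming read_html_file from earlier) could be used instead:

from pathlib import Path

def process_html_files_recursive(directory):
    data = []
    # rglob walks the whole directory tree; the tuple covers both common extensions
    for path in sorted(Path(directory).rglob('*')):
        if path.suffix.lower() in ('.html', '.htm'):
            title, body = read_html_file(path)
            data.append([title, body])
    return data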
def write_to_csv(data, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'Body'])  # write the CSV header row
        writer.writerows(data)              # write the extracted content
directory = 'directory_path'        # replace with the actual directory path
output_file = 'output_path.csv'     # replace with the actual output file path
data = process_html_files(directory)
write_to_csv(data, output_file)
The code above iterates over all HTML files in the specified directory, extracts the title and body text of each one, and writes them to a CSV file. You can adapt the extraction logic and the CSV header row to your own needs.
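For example, if you also wanted the first <h1> heading and the meta description of each page (the field names and selectors here are illustrative assumptions; adjust them to your own documents), the extraction could be changed like this, with the header row and the unpacking in process_html_files updated to match:

def read_html_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')
    title = soup.title.get_text(strip=True) if soup.title else ''
    h1 = soup.find('h1')
    heading = h1.get_text(strip=True) if h1 else ''
    meta = soup.find('meta', attrs={'name': 'description'})
    description = meta.get('content', '') if meta else ''
    return title, heading, description

# process_html_files would then append all three values, e.g.
#     data.append(list(read_html_file(file_path)))
# and write_to_csv would use a matching header row:
#     writer.writerow(['Title', 'Heading', 'Description'])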