Having trouble saving something to a csv file
My program does everything I want it to, but it doesn't save the final data to the csv file. I put a print right before the write to check whether the data is correct, and it is; it just isn't written to the csv file. I'm opening the file with 'a' because I don't want it to overwrite what has already been written, but it still returns an error.
Here is the relevant part of the code:
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
    for row in table.findAll('tr'):
        #heading = row.find('td', {"class":"sectionHeading"})
        #if heading is not None:
            #print(heading.get_text());
        #else:
        label = row.find('td', {"class":"fieldLabel"})
        data = row.find('td', {"class":"fieldData"})
        if data is not None and label is not None:
            csvline += label.get_text() + "," + data.get_text() + ","
            print(csvline)
            #csvline.encode('utf-8')
            with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(csvline)
This is the error:
Traceback (most recent call last):
File "C:\PROJECT\pdfs\final.py", line 95, in <module>
with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
TypeError: 'encoding' is an invalid keyword argument for this function
Here is the full program code, in case it's needed:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
#import unicodecsv as csv
import csv
#import pickle
import requests
from robobrowser import RoboBrowser
import codecs

def rename_files():
    file_list = os.listdir(r"C:\PROJECT\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\PROJECT\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)

rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\PROJECT\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

# Change this to your pdf2htmlEX executable location
bin_path = 'C:\Python27\pdfminer-20140328\tools\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " %args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors

htmls_path = 'C:\PROJECT'
with open ('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)", text) #To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
                print(match)
                writer.writerow(match)
                for item in match:
                    data = item.split('/')
                    case_number = data[0]
                    case_year = data[1]
                    csvline = case_number + ","
                    browser = RoboBrowser()
                    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
                    form = browser.get_forms()[0] # Get the first form on the page
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
                    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
                    # Use BeautifulSoup to parse this data
                    answer = browser.response.text
                    #print(answer)
                    soup = BeautifulSoup(answer)
                    for table in soup.findAll('table', {"class":"formTable"}):
                        for row in table.findAll('tr'):
                            #heading = row.find('td', {"class":"sectionHeading"})
                            #if heading is not None:
                                #print(heading.get_text());
                            #else:
                            label = row.find('td', {"class":"fieldLabel"})
                            data = row.find('td', {"class":"fieldData"})
                            if data is not None and label is not None:
                                csvline += label.get_text() + "," + data.get_text() + ","
                                print(csvline)
                                with open ('output_file_two.csv', 'a') as f:
                                    writer = csv.writer(f)
                                    writer.writerow(csvline)
EDIT
It's working now; this is the code that runs:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import unicodecsv as csv
import requests
from robobrowser import RoboBrowser
import codecs

def rename_files():
    file_list = os.listdir(r"C:\PROJECT\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\PROJECT\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)

rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

base_directory = 'C:\PROJECT\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

bin_path = 'C:\Python27\pdfminer-20140328\tools\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        if not file_name.endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " %args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors

htmls_path = 'C:\PROJECT'
with open ('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)", text)
                print(match)
                writer.writerow(match)
                for item in match:
                    data = item.split('/')
                    case_number = data[0]
                    case_year = data[1]
                    csvline = case_number + ","
                    browser = RoboBrowser()
                    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
                    form = browser.get_forms()[0]
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
                    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
                    answer = browser.response.text
                    soup = BeautifulSoup(answer)
                    for table in soup.findAll('table', {"class":"formTable"}):
                        for row in table.findAll('tr'):
                            label = row.find('td', {"class":"fieldLabel"})
                            data = row.find('td', {"class":"fieldData"})
                            if data is not None and label is not None:
                                csvline += label.get_text() + "," + data.get_text() + ","
                                print(csvline)
                                my_file = codecs.open('final_output.csv', 'a', 'utf-8')
                                my_file.write(csvline)
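The part that changed is the writing at the end: under Python 2 the built-in open() does not accept an encoding argument (which is what raised the TypeError above), while codecs.open() and io.open() do, and unicodecsv handles unicode rows on the csv side. A minimal sketch of the same fix using io.open, assuming Python 2 and a made-up csvline value:

# -*- coding: utf-8 -*-
import io

csvline = u"Case Number,123/16,"  # hypothetical value; in the real script it is built in the loop above

# Python 2's built-in open() has no 'encoding' keyword, but io.open() does:
# it returns a file object that encodes unicode text to UTF-8 on write.
with io.open('final_output.csv', 'a', encoding='utf-8') as f:
    f.write(csvline + u"\n")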
At the end of your code there is a problem:
writer = csv.writer(f)
csv.writer(csvline) # here is the problem
See, you initialize writer, but then you never use it. It should be:
writer = csv.writer(f)
writer.writerow(csvline)
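One thing to watch out for: writerow() expects a sequence of fields, so if csvline is a single comma-joined string it will be split into one-character columns. A small sketch, with made-up field values, of passing a list instead:

import csv

# hypothetical values standing in for label.get_text() and data.get_text()
fields = ['Case Number', '123/16', 'Case Status', 'Granted']

with open('output_file_two.csv', 'ab') as f:  # binary mode for the Python 2 csv module
    writer = csv.writer(f)
    writer.writerow(fields)  # one properly delimited and quoted row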
Here:
with open ('output_file_two.csv', 'a') as f:
    writer = csv.writer(f)
    csv.writer (csvline)
you are instantiating a csv.writer but never using it. This should be:
with open ('output_file_two.csv', 'a') as f:
    writer = csv.writer(f)
    writer.writerow(csvline)
Now, your code has quite a few other issues, the first one being that you build csvline manually as text and then use csv.writer to store it to the file. csv.writer.writerows() expects a list of rows (lists or tuples) and takes care of properly escaping whatever needs escaping, inserting the right delimiters, etc. It also has a writerow() method that takes a single row, which avoids building the whole list in memory, FWIW.
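As a concrete sketch of that approach (assuming the same soup and case_number as in the question; the helper name scrape_row is made up): collect the field values into a list per case and let the writer handle the formatting, opening the output file once instead of once per field:

import csv

def scrape_row(soup, case_number):
    # one row per case: the case number followed by every label/data pair found
    row = [case_number]
    for table in soup.findAll('table', {"class": "formTable"}):
        for tr in table.findAll('tr'):
            label = tr.find('td', {"class": "fieldLabel"})
            data = tr.find('td', {"class": "fieldData"})
            if label is not None and data is not None:
                row.append(label.get_text())
                row.append(data.get_text())
    return row

with open('output_file_two.csv', 'ab') as f:  # binary mode for the Python 2 csv module
    writer = csv.writer(f)
    writer.writerow(scrape_row(soup, case_number))

If the fields can contain non-ASCII text, unicodecsv (already imported in the working version above) is a drop-in replacement for csv that accepts unicode values.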