尝试使用 beautiful soup 导出从 HTML 获取的数据
Trying to export data taken from HTML using beautiful soup
我正在尝试从 HTML 文本中提取信息,这些文本是从我从 For 循环创建的 URL 中获取的,然后使用漂亮的汤。
我可以正确地隔离信息,但是当我尝试导出数据时,我收到一条错误消息“所有数组的长度必须相同”
weblink = []
filing_type = []
company_name = []
date = []
#Importing file
df = pd.read_csv('Downloads\Dropped_Companies.csv')
#Getting companie's names into list
companies_column=list(df.columns.values)[4]
name_ = df[companies_column].tolist()
#Formatting company's names for creating URLs
for CompanyName in name_:
company_name.append(CompanyName.lower().replace(" ",'_'))
company_name
for item in range(0, len(company_name)):
link = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&company=' + company_name[item] + '&type=10-K&dateb=&owner=exclude&count=100'
#Getting the HTML text
headers = random.choice(headers_list)
r = requests.Session()
r.headers = headers
html = r.get(link).text
#Calling beautiful soup for better HTML text
soup = bs.BeautifulSoup(html)
tet_ = soup.find_all("a", id = "documentsbutton")
#Get the links
for link in tet_:
weblink.append('https://www.sec.gov' + link.get('href'))
test11 = soup.find_all("table", class_= "tableFile2")
for link in test11:
row = link.find_all("td", nowrap = "nowrap")
for i in range(0, len(row), 3):
filing_type.append(row[i].getText())
date.append(link.find("td", class_ = "small").find_next_sibling("td").text)
name.append(company_name[item])
data = {'Company Name':name,'Filing Date': date,'Filing Type':filing_type,"Weblink":weblink}
outputdf = pd.DataFrame(data)
outputdf.to_csv('Downloads/t_10KLinks.csv')
data = {'Company Name':name,'Filing Date': date,'Filing Type':filing_type,"Weblink":weblink}
outputdf = pd.DataFrame.from_dict(data, orient='index')
outputdf.to_csv('Downloads/t_10KLinks.csv')
我正在尝试从 HTML 文本中提取信息,这些文本是从我从 For 循环创建的 URL 中获取的,然后使用漂亮的汤。
我可以正确地隔离信息,但是当我尝试导出数据时,我收到一条错误消息“所有数组的长度必须相同”
weblink = []
filing_type = []
company_name = []
date = []
#Importing file
df = pd.read_csv('Downloads\Dropped_Companies.csv')
#Getting companie's names into list
companies_column=list(df.columns.values)[4]
name_ = df[companies_column].tolist()
#Formatting company's names for creating URLs
for CompanyName in name_:
company_name.append(CompanyName.lower().replace(" ",'_'))
company_name
for item in range(0, len(company_name)):
link = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&company=' + company_name[item] + '&type=10-K&dateb=&owner=exclude&count=100'
#Getting the HTML text
headers = random.choice(headers_list)
r = requests.Session()
r.headers = headers
html = r.get(link).text
#Calling beautiful soup for better HTML text
soup = bs.BeautifulSoup(html)
tet_ = soup.find_all("a", id = "documentsbutton")
#Get the links
for link in tet_:
weblink.append('https://www.sec.gov' + link.get('href'))
test11 = soup.find_all("table", class_= "tableFile2")
for link in test11:
row = link.find_all("td", nowrap = "nowrap")
for i in range(0, len(row), 3):
filing_type.append(row[i].getText())
date.append(link.find("td", class_ = "small").find_next_sibling("td").text)
name.append(company_name[item])
data = {'Company Name':name,'Filing Date': date,'Filing Type':filing_type,"Weblink":weblink}
outputdf = pd.DataFrame(data)
outputdf.to_csv('Downloads/t_10KLinks.csv')
data = {'Company Name':name,'Filing Date': date,'Filing Type':filing_type,"Weblink":weblink}
outputdf = pd.DataFrame.from_dict(data, orient='index')
outputdf.to_csv('Downloads/t_10KLinks.csv')