为公司抓取年度报告
Scraping Annual Reports for Companies
我正在从网站上提取年度报告 pdf 文件。
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Alaska Air Group investor-relations page, pre-filtered to the
# "Annual Filings" SEC form group (the query string selects group 471).
url1 = "https://investor.alaskaair.com/financial-information/sec-filings?field_nir_sec_form_group_target_id%5B%5D=471&field_nir_sec_date_filed_value=#views-exposed-form-widget-sec-filings-table"
# NOTE(review): no timeout and no status check — consider
# requests.get(url1, timeout=30) followed by source.raise_for_status().
source = requests.get(url1)
soup = BeautifulSoup(source.text , "html.parser")
我正在尝试从上面提到的 URL 中提取各列,但"View"列只有 8 行,与另一列的行数不同,因此 pandas 抛出 ValueError:arrays must all be same length(数组的长度必须相同)。
# The filing-type cells ("Annual Filings", ...) of the SEC-filings table.
# Fix: stray "<pre><code>" HTML fused to this line made it a syntax error.
tag2 = soup.find_all('div' , class_="field field--name-field-nir-sec-form-group field--type-entity-reference field--label-inline field__item")
def filing_group(tag2):
    """Return the stripped text of every cell in *tag2*.

    Each element is expected to expose a ``.text`` attribute
    (e.g. a BeautifulSoup Tag); surrounding whitespace is removed.
    """
    return [cell.text.strip() for cell in tag2]
# NOTE(review): return value is discarded; kept as a smoke-test call.
filing_group(tag2)
# <span> wrappers that hold each filing's PDF download link.
tag4 = soup.find_all('span' , class_ = "file file--mime-application-pdf file--application-pdf")
def view(tag4):
    """Return one href (or None) per element of *tag4*.

    Bug fix: the original wrapped the whole loop in a single
    try/except AttributeError, so the first span lacking an <a>
    child aborted the loop — all remaining links were dropped and
    the "View" column ended up shorter than "Filing Group",
    triggering pandas' "arrays must all be same length" ValueError.
    Handle a missing link per element instead, appending None as a
    placeholder so the output stays aligned with the input.
    """
    links = []
    for item in tag4:
        anchor = item.a  # BeautifulSoup returns None when no <a> child exists
        links.append(anchor.get('href') if anchor is not None else None)
    return links
# NOTE(review): return value is discarded; kept as a smoke-test call.
view(tag4)
def scrape_page():
    """Collect filing-group labels and PDF links into one dict.

    Fixes: removed the redundant double initialization of the dict,
    and pads the shorter column with None so both lists always have
    equal length — otherwise pd.DataFrame raises
    "ValueError: arrays must all be same length" when the page has
    fewer PDF links than filing rows.
    """
    groups = filing_group(tag2)
    links = view(tag4)
    width = max(len(groups), len(links))
    groups += [None] * (width - len(groups))
    links += [None] * (width - len(links))
    return {"Filing Group": groups, "View": links}
# Build the result table; pd.DataFrame raises ValueError here if the
# dict's lists differ in length (the error the author reports).
scrape_page_df = pd.DataFrame(scrape_page())
我正在从网站上提取年度报告 pdf 文件。
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Alaska Air Group investor-relations page, pre-filtered to the
# "Annual Filings" SEC form group (the query string selects group 471).
url1 = "https://investor.alaskaair.com/financial-information/sec-filings?field_nir_sec_form_group_target_id%5B%5D=471&field_nir_sec_date_filed_value=#views-exposed-form-widget-sec-filings-table"
# NOTE(review): no timeout and no status check — consider
# requests.get(url1, timeout=30) followed by source.raise_for_status().
source = requests.get(url1)
soup = BeautifulSoup(source.text , "html.parser")
我正在尝试从上面提到的 URL 中提取各列,但"View"列只有 8 行,与另一列的行数不同,因此 pandas 抛出 ValueError:arrays must all be same length(数组的长度必须相同)。
# The filing-type cells ("Annual Filings", ...) of the SEC-filings table.
# Fix: stray "<pre><code>" HTML fused to this line made it a syntax error.
tag2 = soup.find_all('div' , class_="field field--name-field-nir-sec-form-group field--type-entity-reference field--label-inline field__item")
def filing_group(tag2):
    """Return the stripped text of every cell in *tag2*.

    Each element is expected to expose a ``.text`` attribute
    (e.g. a BeautifulSoup Tag); surrounding whitespace is removed.
    """
    return [cell.text.strip() for cell in tag2]
# NOTE(review): return value is discarded; kept as a smoke-test call.
filing_group(tag2)
# <span> wrappers that hold each filing's PDF download link.
tag4 = soup.find_all('span' , class_ = "file file--mime-application-pdf file--application-pdf")
def view(tag4):
    """Return one href (or None) per element of *tag4*.

    Bug fix: the original wrapped the whole loop in a single
    try/except AttributeError, so the first span lacking an <a>
    child aborted the loop — all remaining links were dropped and
    the "View" column ended up shorter than "Filing Group",
    triggering pandas' "arrays must all be same length" ValueError.
    Handle a missing link per element instead, appending None as a
    placeholder so the output stays aligned with the input.
    """
    links = []
    for item in tag4:
        anchor = item.a  # BeautifulSoup returns None when no <a> child exists
        links.append(anchor.get('href') if anchor is not None else None)
    return links
# NOTE(review): return value is discarded; kept as a smoke-test call.
view(tag4)
def scrape_page():
    """Collect filing-group labels and PDF links into one dict.

    Fixes: removed the redundant double initialization of the dict,
    and pads the shorter column with None so both lists always have
    equal length — otherwise pd.DataFrame raises
    "ValueError: arrays must all be same length" when the page has
    fewer PDF links than filing rows.
    """
    groups = filing_group(tag2)
    links = view(tag4)
    width = max(len(groups), len(links))
    groups += [None] * (width - len(groups))
    links += [None] * (width - len(links))
    return {"Filing Group": groups, "View": links}
# Build the result table; pd.DataFrame raises ValueError here if the
# dict's lists differ in length (the error the author reports).
scrape_page_df = pd.DataFrame(scrape_page())