Assign number for instance that loop occurred
I'm trying to assign a number to each instance for which the loop collects data, so I can look it up later.
The problem I'm running into is that it will only assign one number to the column for all the loop instances, or it pretty much tells me it can't, with:
ValueError: Length of values (1) does not match length of index (2)
I just want to add a number in a column so I can tell which pass of the code pulled that instance.
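(For context, pandas raises that ValueError whenever a column is assigned a sequence whose length doesn't match the frame's index; a minimal, hypothetical reproduction:)

import pandas as pd

df = pd.DataFrame({'COUNTY': ['ADAMS', 'ASOTIN']})   # a 2-row frame
df['STATE'] = [0]   # ValueError: Length of values (1) does not match length of index (2)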
Help, I've been banging my head against the wall.
Thanks, all.
Here is my code:
states = ["Washington", "Oregon"]
period = "2020"
num_states = len(states)
state_list = []
i = 0
df = pd.DataFrame()
df[state] = i
for state in states:
    x = state
    driver = webdriver.Chrome(executable_path = 'C:/webdrivers/chromedriver.exe')
    driver.get('https://www.nbc.gov/pilt/counties.cfm')
    driver.implicitly_wait(20)
    state_s = driver.find_element(By.NAME, 'state_code')
    drp = Select(state_s)
    drp.select_by_visible_text(state)
    year_s = driver.find_element(By.NAME, 'fiscal_yr')
    drp = Select(year_s)
    drp.select_by_visible_text(period)
    driver.implicitly_wait(10)
    link = driver.find_element(By.NAME, 'Search')
    link.click()
    url = driver.current_url
    page = requests.get(url)
    #dfs = pd.read_html(addrss)[2]
    # Get the html
    soup = BeautifulSoup(page.text, 'lxml')
    state_build = url.split('code=',1)[1]
    state_id = state_build[:2]
    table = soup.findAll('table')[2]
    headers = []
    for i in table.find_all('th'):
        title = i.text.strip()
        headers.append(title)
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df = df.append(row_data)
    dfs = df.set_index(df.groupby(level = 0)\
        .cumcount(), append = True).stack()\
        .unstack(0)\
        .rename(columns={0 : 'COUNTY', 1: 'PRICE', 2: "TOTAL ACRES"})
    dfs['STATE'] = i
    time.sleep(5)
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.findAll('table')[1]
    headers = []
    i = i + 1
dfs
Use some variable other than i to show the loop increment, since i is also used within the loop. Let's say r, but feel free to pick a better name later. For now, you can simply append it to row_data and update the dfs = line to have an extra renamed column called LOOP.
Note: I think you may want to revisit the rest of the code to see if some things can be simplified, and to replace those hard-coded waits with selenium condition-based waits.
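For example, a condition-based wait (just a sketch, reusing the WebDriverWait and expected_conditions imports already in the full code below) could replace the blanket implicitly_wait calls:

wait = WebDriverWait(driver, 20)
# Block until the state dropdown is actually present instead of waiting a fixed time.
state_s = wait.until(EC.presence_of_element_located((By.NAME, 'state_code')))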
r = 0
df = pd.DataFrame()
for state in states:
    # ......
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        row_data.append(r)
        length = len(df)
        df = df.append(row_data)
    dfs = df.set_index(df.groupby(level = 0)\
        .cumcount(), append = True).stack()\
        .unstack(0)\
        .rename(columns={0 : 'COUNTY', 1: 'PRICE', 2: "TOTAL ACRES", 3:"LOOP"})
    # .....
    r+=1
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

states = ["Washington", "Oregon"]
period = "2020"
num_states = len(states)
state_list = []
r = 0
df = pd.DataFrame()
for state in states:
    x = state
    driver = webdriver.Chrome()#(executable_path = 'C:/webdrivers/chromedriver.exe')
    driver.get('https://www.nbc.gov/pilt/counties.cfm')
    driver.implicitly_wait(20)
    state_s = driver.find_element(By.NAME, 'state_code')
    drp = Select(state_s)
    drp.select_by_visible_text(state)
    year_s = driver.find_element(By.NAME, 'fiscal_yr')
    drp = Select(year_s)
    drp.select_by_visible_text(period)
    driver.implicitly_wait(10)
    link = driver.find_element(By.NAME, 'Search')
    link.click()
    url = driver.current_url
    page = requests.get(url)
    #dfs = pd.read_html(addrss)[2]
    # Get the html
    soup = BeautifulSoup(page.text, 'lxml')
    state_build = url.split('code=',1)[1]
    state_id = state_build[:2]
    table = soup.findAll('table')[2]
    headers = []
    for i in table.find_all('th'): # you use i here!
        title = i.text.strip()
        headers.append(title)
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        row_data.append(r)
        length = len(df)
        df = df.append(row_data)
    dfs = df.set_index(df.groupby(level = 0)\
        .cumcount(), append = True).stack()\
        .unstack(0)\
        .rename(columns={0 : 'COUNTY', 1: 'PRICE', 2: "TOTAL ACRES", 3:"LOOP"})
    time.sleep(5)
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.findAll('table')[1]
    headers = []
    r+=1
dfs
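One caveat if you run this on a current pandas: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the df = df.append(row_data) lines would fail there. A sketch of the usual replacement (collecting the rows in a plain list and building the frame once; the column names just mirror the rename above):

rows = []
for row in table.find_all('tr')[1:]:
    row_data = [td.text.strip() for td in row.find_all('td')]
    row_data.append(r)   # tag the row with the loop number
    rows.append(row_data)
# Build the frame in one shot instead of appending row by row.
df = pd.DataFrame(rows).rename(columns={0: 'COUNTY', 1: 'PRICE', 2: 'TOTAL ACRES', 3: 'LOOP'})

Since each entry in rows is already one record, this should also make the set_index/stack/unstack reshaping unnecessary.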
This is exactly what I use enumerate for. So leaving QHarr's code as is (so accept his solution, I'm just adding onto it...), you can see the subtle difference (no need to set r = 0 and then have to increment with r+=1).
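A two-line illustration of what enumerate yields on this list:

for r, state in enumerate(["Washington", "Oregon"]):
    print(r, state)   # 0 Washington, then 1 Oregon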
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

states = ["Washington", "Oregon"]
period = "2020"
num_states = len(states)
state_list = []
df = pd.DataFrame()
for r, state in enumerate(states): #<- r will be the index position from the list `states` as it iterates through
    x = state
    driver = webdriver.Chrome()#(executable_path = 'C:/webdrivers/chromedriver.exe')
    driver.get('https://www.nbc.gov/pilt/counties.cfm')
    driver.implicitly_wait(20)
    state_s = driver.find_element(By.NAME, 'state_code')
    drp = Select(state_s)
    drp.select_by_visible_text(state)
    year_s = driver.find_element(By.NAME, 'fiscal_yr')
    drp = Select(year_s)
    drp.select_by_visible_text(period)
    driver.implicitly_wait(10)
    link = driver.find_element(By.NAME, 'Search')
    link.click()
    url = driver.current_url
    page = requests.get(url)
    #dfs = pd.read_html(addrss)[2]
    # Get the html
    soup = BeautifulSoup(page.text, 'lxml')
    state_build = url.split('code=',1)[1]
    state_id = state_build[:2]
    table = soup.findAll('table')[2]
    headers = []
    for i in table.find_all('th'): # you use i here!
        title = i.text.strip()
        headers.append(title)
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        row_data.append(r)
        length = len(df)
        df = df.append(row_data)
    dfs = df.set_index(df.groupby(level = 0)\
        .cumcount(), append = True).stack()\
        .unstack(0)\
        .rename(columns={0 : 'COUNTY', 1: 'PRICE', 2: "TOTAL ACRES", 3:"LOOP"})
    time.sleep(5)
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.findAll('table')[1]
    headers = []
dfs
HOWEVER... any reason you're not just using pandas here and simply formatting the url?
import pandas as pd

states = ["WA", "OR"]
period = "2020"
dfs = []
for state in states:
    df = pd.read_html('https://www.nbc.gov/pilt/counties.cfm?term=county&state_code={state}&fiscal_yr={period}'.format(state=state,period=period))[-1]
    df['State'] = state
    dfs.append(df)
df = pd.concat(dfs).reset_index(drop=True)
Output:
print(df)
COUNTY PAYMENT PAYMENT.1 PAYMENT.2 TOTAL ACRES State
0 ADAMS COUNTY ,408 ,408 ,408 21337 WA
1 ASOTIN COUNTY 4,550 4,550 4,550 71580 WA
2 BENTON COUNTY 1,659 1,659 1,659 64264 WA
3 CHELAN COUNTY ,244,827 ,244,827 ,244,827 1486918 WA
4 CLALLAM COUNTY ,101,485 ,101,485 ,101,485 523298 WA
.. ... ... ... ... ... ...
71 WASCO COUNTY ,973 ,973 ,973 220099 OR
72 WASHINGTON COUNTY ,545 ,545 ,545 13984 OR
73 WHEELER COUNTY 0,613 0,613 0,613 301762 OR
74 YAMHILL COUNTY ,627 ,627 ,627 58311 OR
75 TOTAL ,321,995 ,321,995 ,321,995 31312205 OR
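And if you still want the loop number from the original question with this approach, a small sketch using enumerate on top of the same read_html loop (the LOOP column name is just carried over from the answers above):

import pandas as pd

states = ["WA", "OR"]
period = "2020"
dfs = []
for r, state in enumerate(states):
    df = pd.read_html('https://www.nbc.gov/pilt/counties.cfm?term=county&state_code={state}&fiscal_yr={period}'.format(state=state, period=period))[-1]
    df['LOOP'] = r   # number every row scraped on this pass
    dfs.append(df)
df = pd.concat(dfs).reset_index(drop=True)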