如何使用非头部网页抓取工具加载更多选项 [Instagram]
How to use load more option with a non head web scraper [Instagram]
我正在尝试使用 URL 抓取从 Instagram 下载位置详细信息,但我无法使用加载更多选项从 URL 抓取更多位置。
我很感激关于如何修改代码的建议,或者我需要使用哪个新代码块来获取特定 url 中所有可用位置的建议。
代码:
import re
import requests
import json
import pandas as pd
import numpy as np
import csv
from geopy.geocoders import Nominatim
def Location_city(F_name):
    """Scrape location names from one Instagram locations-directory page,
    geocode each ASCII-only name with Nominatim, and write name/id/lat/lon
    rows to a CSV file.

    F_name: output CSV file name, appended to the fixed Windows path.
    """
    # Backslashes must be escaped: the original `"...\instagram\"` ended in
    # `\"` and was an unterminated-string SyntaxError.
    path = "D:\\Everyday_around_world\\instagram\\"
    filename = path + F_name
    url1 = "https://www.instagram.com/explore/locations/c1027234/hyderabad-india/"
    r = requests.get(url1)
    # Instagram embeds the page payload as JSON in window._sharedData.
    match = re.search('window._sharedData = (.*);</script>', r.text)
    a = json.loads(match.group(1))
    b = a['entry_data']['LocationsDirectoryPage'][0]['location_list']
    # Build the geocoder once, outside the loop; current geopy versions
    # require an explicit user_agent for Nominatim.
    geolocator = Nominatim(user_agent="instagram_location_scraper")
    rows = []
    for z in b:
        # Keep only pure-ASCII names so geocoding and CSV output stay simple.
        if all(ord(char) < 128 for char in z['name']):
            name = str(z['name'])
            print(name)
            location = geolocator.geocode(name, timeout=10000)
            if location is not None:
                rows.append({'name': z['name'], 'id': z['id'],
                             'latitude': location.latitude,
                             'longitude': location.longitude})
    # DataFrame.append is deprecated/removed in modern pandas; build the
    # frame from the collected rows in one step instead.
    df3 = pd.DataFrame(rows, columns=['name', 'id', 'latitude', 'longitude'])
    df3.to_csv(filename, header=True, index=False)


Location_city("Hyderabad_locations.csv")
在此先感谢您的帮助..
关于 instagram "see more" 按钮的 url:我想你描述的应该是在你正在抓取的 url 后面添加页码参数,像这样:https://www.instagram.com/explore/locations/c1027234/hyderabad-india/?page=2
您可以添加一个计数器来递增页码并循环,只要仍有结果返回就继续抓取。我添加了一个 try/except,用于捕获没有更多结果时抛出的 KeyError,然后据此设置条件退出循环,并将数据帧写入 csv。
修改后的代码:
import re
import requests
import json
import pandas as pd
import numpy as np
import csv
from geopy.geocoders import Nominatim
def Location_city(F_name):
    """Scrape every page of an Instagram locations-directory listing by
    incrementing the ?page= query parameter, geocode each ASCII-only
    location name, and write name/id/lat/lon rows to a CSV file.

    F_name: output CSV file name, appended to the fixed Windows path.
    """
    # Backslashes must be escaped: the original `"...\instagram\"` ended in
    # `\"` and was an unterminated-string SyntaxError.
    path = "D:\\Everyday_around_world\\instagram\\"
    filename = path + F_name
    url1 = "https://www.instagram.com/explore/locations/c1027234/hyderabad-india/?page="
    # Build the geocoder once, outside the loops; current geopy versions
    # require an explicit user_agent for Nominatim.
    geolocator = Nominatim(user_agent="instagram_location_scraper")
    rows = []
    page_number = 1
    while True:
        r = requests.get(url1 + str(page_number))  # fetch the current page
        # Instagram embeds the page payload as JSON in window._sharedData.
        match = re.search('window._sharedData = (.*);</script>', r.text)
        a = json.loads(match.group(1))
        try:
            b = a['entry_data']['LocationsDirectoryPage'][0]['location_list']
        except KeyError:
            # Past the last page: the payload has no location_list key.
            # (Original used Python-2 `print "..."`, a py3 SyntaxError.)
            print("No more locations returned")
            break  # also avoids the original's extra request after the last page
        for z in b:
            # Keep only pure-ASCII names so geocoding stays simple.
            if all(ord(char) < 128 for char in z['name']):
                name = str(z['name'])
                print(name)
                location = geolocator.geocode(name, timeout=10000)
                if location is not None:
                    rows.append({'name': z['name'], 'id': z['id'],
                                 'latitude': location.latitude,
                                 'longitude': location.longitude})
        page_number += 1  # advance; no temp var shadowing builtin `next`
    # DataFrame.append is deprecated/removed in modern pandas; build the
    # frame from the collected rows in one step after the loop finishes.
    df3 = pd.DataFrame(rows, columns=['name', 'id', 'latitude', 'longitude'])
    df3.to_csv(filename, header=True, index=False)


Location_city("Hyderabad_locations.csv")
我正在尝试使用 URL 抓取从 Instagram 下载位置详细信息,但我无法使用加载更多选项从 URL 抓取更多位置。
我很感激关于如何修改代码的建议,或者我需要使用哪个新代码块来获取特定 url 中所有可用位置的建议。
代码:
import re
import requests
import json
import pandas as pd
import numpy as np
import csv
from geopy.geocoders import Nominatim
def Location_city(F_name):
    """Scrape location names from one Instagram locations-directory page,
    geocode each ASCII-only name with Nominatim, and write name/id/lat/lon
    rows to a CSV file.

    F_name: output CSV file name, appended to the fixed Windows path.
    """
    # Backslashes must be escaped: the original `"...\instagram\"` ended in
    # `\"` and was an unterminated-string SyntaxError.
    path = "D:\\Everyday_around_world\\instagram\\"
    filename = path + F_name
    url1 = "https://www.instagram.com/explore/locations/c1027234/hyderabad-india/"
    r = requests.get(url1)
    # Instagram embeds the page payload as JSON in window._sharedData.
    match = re.search('window._sharedData = (.*);</script>', r.text)
    a = json.loads(match.group(1))
    b = a['entry_data']['LocationsDirectoryPage'][0]['location_list']
    # Build the geocoder once, outside the loop; current geopy versions
    # require an explicit user_agent for Nominatim.
    geolocator = Nominatim(user_agent="instagram_location_scraper")
    rows = []
    for z in b:
        # Keep only pure-ASCII names so geocoding and CSV output stay simple.
        if all(ord(char) < 128 for char in z['name']):
            name = str(z['name'])
            print(name)
            location = geolocator.geocode(name, timeout=10000)
            if location is not None:
                rows.append({'name': z['name'], 'id': z['id'],
                             'latitude': location.latitude,
                             'longitude': location.longitude})
    # DataFrame.append is deprecated/removed in modern pandas; build the
    # frame from the collected rows in one step instead.
    df3 = pd.DataFrame(rows, columns=['name', 'id', 'latitude', 'longitude'])
    df3.to_csv(filename, header=True, index=False)


Location_city("Hyderabad_locations.csv")
在此先感谢您的帮助..
关于 instagram "see more" 按钮的 url:我想你描述的应该是在你正在抓取的 url 后面添加页码参数,像这样:https://www.instagram.com/explore/locations/c1027234/hyderabad-india/?page=2
您可以添加一个计数器来递增页码并循环,只要仍有结果返回就继续抓取。我添加了一个 try/except,用于捕获没有更多结果时抛出的 KeyError,然后据此设置条件退出循环,并将数据帧写入 csv。
修改后的代码:
import re
import requests
import json
import pandas as pd
import numpy as np
import csv
from geopy.geocoders import Nominatim
def Location_city(F_name):
    """Scrape every page of an Instagram locations-directory listing by
    incrementing the ?page= query parameter, geocode each ASCII-only
    location name, and write name/id/lat/lon rows to a CSV file.

    F_name: output CSV file name, appended to the fixed Windows path.
    """
    # Backslashes must be escaped: the original `"...\instagram\"` ended in
    # `\"` and was an unterminated-string SyntaxError.
    path = "D:\\Everyday_around_world\\instagram\\"
    filename = path + F_name
    url1 = "https://www.instagram.com/explore/locations/c1027234/hyderabad-india/?page="
    # Build the geocoder once, outside the loops; current geopy versions
    # require an explicit user_agent for Nominatim.
    geolocator = Nominatim(user_agent="instagram_location_scraper")
    rows = []
    page_number = 1
    while True:
        r = requests.get(url1 + str(page_number))  # fetch the current page
        # Instagram embeds the page payload as JSON in window._sharedData.
        match = re.search('window._sharedData = (.*);</script>', r.text)
        a = json.loads(match.group(1))
        try:
            b = a['entry_data']['LocationsDirectoryPage'][0]['location_list']
        except KeyError:
            # Past the last page: the payload has no location_list key.
            # (Original used Python-2 `print "..."`, a py3 SyntaxError.)
            print("No more locations returned")
            break  # also avoids the original's extra request after the last page
        for z in b:
            # Keep only pure-ASCII names so geocoding stays simple.
            if all(ord(char) < 128 for char in z['name']):
                name = str(z['name'])
                print(name)
                location = geolocator.geocode(name, timeout=10000)
                if location is not None:
                    rows.append({'name': z['name'], 'id': z['id'],
                                 'latitude': location.latitude,
                                 'longitude': location.longitude})
        page_number += 1  # advance; no temp var shadowing builtin `next`
    # DataFrame.append is deprecated/removed in modern pandas; build the
    # frame from the collected rows in one step after the loop finishes.
    df3 = pd.DataFrame(rows, columns=['name', 'id', 'latitude', 'longitude'])
    df3.to_csv(filename, header=True, index=False)


Location_city("Hyderabad_locations.csv")