I'm trying to scrape data from the website below. The tables are split across multiple pages, but every page uses the same URL. I'm using Requests, pandas and BeautifulSoup to parse the HTML, and I'm able to scrape the initial table, but I need the data from all of the tables.
Note that I'm trying to use only pandas, BeautifulSoup and Requests. Also note: the site shows data for a specific time, and the pages range from 0 to 9, sometimes 10 or 15.
I have written the program below, but it only extracts the first table. I want to extract the data from all pages. Please help.
import requests
from bs4 import BeautifulSoup
import pandas as pd

params = {  # Console
    "pos": " ",
    "stats": " ",
    "type": " "
}

data = {  # Network --> Request
    "__EVENTTARGET": "grdMWCG@ctl29",  # grdMWCG$ctl29$ctl00
    "__EVENTARGUMENT": "",  # 'FireCommand:grdMWCG$ctl29;PageSize;1000',
    # '__VIEWSTATE': "qI7kN0JsQhV3qLHXe45eUTDT1YiA+R/k6VJkLSCL++BAyoKgHIgYgPbr+q/NsFwQ5BHwqPkAeV25qNkOUOoz3vfYWD1d77cTyCM4sJKinGsyC9FfPkqSEPT5lEOhkP41a3Xo1GywCFbapgM83hXY5/Lu/RrQAjLNdEUwsg+dj/WHn+aE",
    "__VIEWSTATEGENERATOR": "00145D45",
    "__VIEWSTATEENCRYPTED": ""
    # '__EVENTVALIDATION': "41pWf5gO9W1jt4YKQK30WKaHlv3pGh+kaJAUpRk5MkyZqR/AauHgJ5YEFWg9DjwtQNxwXWSH6QbNwjgOjgyI2fkLOtSbFu0r9JuA+dXqtmvLUuIHECQv8HNH+8a1c179u6QvlBSsSMzIvpfJNPPu7ats04coyA3FHDghbZPDWqREQeq5"
}

def main(url):
    with requests.Session() as req:
        # r = req.get(url, params=params)
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        data['__VIEWSTATE'] = soup.find("input", id="__VIEWSTATE").get("value")
        data['__EVENTVALIDATION'] = soup.find(
            "input", id="__EVENTVALIDATION").get("value")
        # r = req.post(url, params=params, data=data)
        r = req.post(url, data=data)
        df = pd.read_html(r.content, attrs={'id': 'grdMWCG'})[0]  # Table id #grdMWCG
        # df.drop(df.columns[3], axis=1, inplace=True)
        print(df)
        df.to_csv("ccil.csv", index=False)

main("https://www.ccilindia.com/OMMWCG.aspx")
So, to get the data from all pages, you have to request each of them.
When you look at the code that is invoked when you manually click through the pages, you can see that it calls the __doPostBack function:
<a href="javascript:__doPostBack('grdMWCG$ctl29$ctl01','')" style="color:#35496A;">2</a>
This function writes the given values into the form fields and submits the form:
function __doPostBack(eventTarget, eventArgument) {
    if (!theForm.onsubmit || (theForm.onsubmit() != false)) {
        theForm.__EVENTTARGET.value = eventTarget;
        theForm.__EVENTARGUMENT.value = eventArgument;
        theForm.submit();
    }
}
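You can replicate that postback without a browser: copy the hidden ASP.NET form fields from the last response and POST them back with __EVENTTARGET set to the pager link's first argument. Here is a minimal sketch; the hidden_fields helper is a hypothetical name, and posting the inputs keyed by their name attribute is an assumption about the form markup:

import requests
from bs4 import BeautifulSoup

URL = "https://www.ccilindia.com/OMMWCG.aspx"

def hidden_fields(html):
    # Collect the hidden inputs (__VIEWSTATE, __EVENTVALIDATION, ...)
    # that ASP.NET expects to be echoed back on every postback.
    soup = BeautifulSoup(html, "html.parser")
    return {inp["name"]: inp.get("value", "")
            for inp in soup.find_all("input", type="hidden")
            if inp.get("name")}

with requests.Session() as session:
    first = session.get(URL)
    form = hidden_fields(first.content)
    # Emulate __doPostBack('grdMWCG$ctl29$ctl01', '') -> page 2
    form["__EVENTTARGET"] = "grdMWCG$ctl29$ctl01"
    form["__EVENTARGUMENT"] = ""
    page2 = session.post(URL, data=form)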
The page number is encoded in the first argument of the function call:
grdMWCG$ctl29$ctl01 // page 2
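Instead of hard-coding that suffix, you could also read the targets straight out of the pager anchors on each page. A hedged sketch; the regex and the "$ctl29$" filter are guesses based on the anchor shown above:

import re
from bs4 import BeautifulSoup

def pager_targets(html):
    # Extract every __doPostBack('...','') target from the pager links,
    # e.g. "grdMWCG$ctl29$ctl01" for page 2.
    soup = BeautifulSoup(html, "html.parser")
    targets = []
    for a in soup.find_all("a", href=re.compile(r"__doPostBack")):
        m = re.search(r"__doPostBack\('([^']+)'", a["href"])
        if m and "$ctl29$" in m.group(1):  # keep only this grid's pager links
            targets.append(m.group(1))
    return targets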
You should create a loop that iterates over the number of pages, makes the request, and extracts the data.
The code below requests the first page, extracts the page count, updates the post fields, and requests the data from all pages.
It prints the table of the first page and writes the combined result of all pages to a CSV via a pd.to_csv() call on the df.
import requests
from bs4 import BeautifulSoup
import pandas as pd

params = {  # Console
    "pos": " ",
    "stats": " ",
    "type": " "
}

data = {
    "__EVENTTARGET": "",
    "__EVENTARGUMENT": "",
    "__VIEWSTATE": "",
    "__VIEWSTATEGENERATOR": "",
    "__VIEWSTATEENCRYPTED": "",
    "__EVENTVALIDATION": "",
}

# Update the data fields with the new values from the response.
# Fields that can't be found are left unchanged.
def updateData(response):
    global data
    soup = BeautifulSoup(response.content, 'html.parser')
    for i in data:
        try:
            data[i] = soup.find("input", id=i).get("value")
        except AttributeError:  # soup.find returned None: field not in this response
            pass

def main(url):
    global data
    # Suffixes ctl01..ctl08 address pages 2..9; pages beyond 9 would need
    # two-digit suffixes (ctl10, ...), which this scheme does not cover.
    targetString = "grdMWCG$ctl29$ctl0"
    with requests.Session() as req:
        r = req.get(url)
        df = pd.read_html(r.content, attrs={'id': 'grdMWCG'})[0]
        # Print the table of the first page
        print(df)
        # Read the page count from the last character of the pager cell
        # in the first column. The 26 is a magic number; I don't know
        # if it will always be 26.
        pageLength = int(df[0][26][-1])
        updateData(r)
        for pageNumber in range(1, pageLength):
            data["__EVENTTARGET"] = targetString + str(pageNumber)
            r = req.post(url, data=data)
            _df = pd.read_html(r.content, attrs={'id': 'grdMWCG'})[0]
            updateData(r)
            # df.append was removed in pandas 2.0; concat does the same job
            df = pd.concat([df, _df], ignore_index=True)
        df.to_csv("ccild.csv", index=False)

main("https://www.ccilindia.com/OMMWCG.aspx")