Asked by: 小点点

Problem fetching data from the next page with a POST request in Python


I am trying to scrape data from the website below. The tables are split across several pages, but every page uses the same URL. I am using Requests, pandas and BeautifulSoup to parse the HTML, and I can only scrape the initial table, but I need the full data from all of the tables.

Note that I am trying to use only pandas, BeautifulSoup and Requests. Also note: the site shows data for a specific time, and the number of pages ranges from 0 to 9, sometimes 10 or 15.

I have written the program below, but it only extracts the first table; I want to extract the data from all pages. Please help.

import requests
from bs4 import BeautifulSoup
import pandas as pd

params = {                   #Console
    "pos": " ",
    "stats": " ",
    "type": " "
}

data = {                                # Network --> Request 
    "__EVENTTARGET" : "grdMWCG@ctl29",   #grdMWCG$ctl29$ctl00
    '__EVENTARGUMENT': "",    #'FireCommand:grdMWCG$ctl29;PageSize;1000',
   #'__VIEWSTATE' : "qI7kN0JsQhV3qLHXe45eUTDT1YiA+R/k6VJkLSCL++BAyoKgHIgYgPbr+q/NsFwQ5BHwqPkAeV25qNkOUOoz3vfYWD1d77cTyCM4sJKinGsyC9FfPkqSEPT5lEOhkP41a3Xo1GywCFbapgM83hXY5/Lu/RrQAjLNdEUwsg+dj/WHn+aE",
    '__VIEWSTATEGENERATOR': "00145D45",
    '__VIEWSTATEENCRYPTED':""
   #'__EVENTVALIDATION': "41pWf5gO9W1jt4YKQK30WKaHlv3pGh+kaJAUpRk5MkyZqR/AauHgJ5YEFWg9DjwtQNxwXWSH6QbNwjgOjgyI2fkLOtSbFu0r9JuA+dXqtmvLUuIHECQv8HNH+8a1c179u6QvlBSsSMzIvpfJNPPu7ats04coyA3FHDghbZPDWqREQeq5"
}

def main(url):
    with requests.Session() as req:
        # r = req.get(url, params=params)
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        data['__VIEWSTATE'] = soup.find("input", id="__VIEWSTATE").get("value")
        data['__EVENTVALIDATION'] = soup.find(
            "input", id="__EVENTVALIDATION").get("value")
        # r = req.post(url, params=params, data=data)
        r = req.post(url, data=data)
        df = pd.read_html(r.content, attrs={'id': 'grdMWCG'})[0]   # Table id: grdMWCG
        # df.drop(df.columns[3], axis=1, inplace=True)
        print(df)
        df.to_csv("ccil.csv", index=False)


main("https://www.ccilindia.com/OMMWCG.aspx")


1 Answer

Anonymous user

So, to get the data from all pages, you need to request each of them.

When you look at the code that runs when you click through the pages manually, you will see that it calls the __doPostBack function:

<a href="javascript:__doPostBack('grdMWSG$ctl29$ctl01','')" style="color:#35496A;">2</a>

This function fills the values into the form fields and submits the form.

function __doPostBack(eventTarget, eventArgument) {
    if (!theForm.onsubmit || (theForm.onsubmit() != false)) {
        theForm.__EVENTTARGET.value = eventTarget;
        theForm.__EVENTARGUMENT.value = eventArgument;
        theForm.submit();
    }
}

The page number is encoded in the value passed to that function call:

grdMWSG$ctl29$ctl01 // page 2
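
In requests, simulating one of those clicks just means posting the page's hidden ASP.NET fields back with __EVENTTARGET set to that value. A minimal sketch, assuming the grdMWCG$ctl29 prefix from the question's grid (post_back_page and state are placeholder names; state has to carry the __VIEWSTATE, __EVENTVALIDATION etc. read from the previous response):

import requests

# Sketch: replay one pager click the way __doPostBack does.
# The trailing digit is the zero-based page index, so ...ctl01 requests page 2.
def post_back_page(session: requests.Session, url: str, state: dict, page_index: int):
    state["__EVENTTARGET"] = "grdMWCG$ctl29$ctl0" + str(page_index)
    state["__EVENTARGUMENT"] = ""   # __doPostBack passes an empty argument here
    return session.post(url, data=state)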

You should create a loop that iterates over the number of pages, makes the request, and extracts the data.

The code below requests the first page, extracts the page count, updates the POST fields, and requests the data from all pages.

It just prints the data. If you want to save it, you have to add a pd.to_csv call.

import requests
from bs4 import BeautifulSoup
import pandas as pd

params = {                   #Console
    "pos": " ",
    "stats": " ",
    "type": " "
}

data = {
    "__EVENTTARGET": "",
    "__EVENTARGUMENT": "",
    "__VIEWSTATE": "",
    "__VIEWSTATEGENERATOR": "",
    "__VIEWSTATEENCRYPTED": "",
    "__EVENTVALIDATION": ""
}


# Update the data Fields with new Values 
# For fields that can't be found the exception can be ignored 
def updateData(response):
    global data
    soup = BeautifulSoup(response.content, 'html.parser')
    for i in data:
        try:
            data[i] = soup.find("input", id=i).get("value")
        except:
            pass

def main(url):
    global data
    targetString = "grdMWCG$ctl29$ctl0"
    with requests.Session() as req:
        r = req.get(url)
        df = pd.read_html(r.content, attrs={'id': 'grdMWCG'})[0]

        # Print the table of the first page
        print(df)

        # Get the last row of the first column, where the pager with the page numbers is rendered.
        # The 26 is a magic number; I don't know if it will always be 26.
        pageLength = int(df[0][26][-1])

        updateData(r)

        for pageNumber in range(1, pageLength):
            data["__EVENTTARGET"] = targetString + str(pageNumber)
            r = req.post(url, data=data)
            _df = pd.read_html(r.content, attrs={'id': 'grdMWCG'})[0]
            updateData(r)
            # DataFrame.append was removed in pandas 2.0; concat does the same job here
            df = pd.concat([df, _df], ignore_index=True)
        df.to_csv("ccild.csv", index=False)


main("https://www.ccilindia.com/OMMWCG.aspx")