使用pyqt webkit写headless 动态爬虫,需要捕获到所有的HTTP request和response(包括JS异步请求)。
QNetworkReply 没有返回content-length相关的方法,rawHeaderList实际上并不包括content-length。
一个可行的方法,是在readyRead信号触发时,size()方法获取到body的大小。示例代码如下,请注意self.reply_lst这个列表是必不可少的:
import sys
from PyQt5.Qt import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.Qt import QWebView, QWebPage
from PyQt5.QtNetwork import QNetworkAccessManager
class Manager(QNetworkAccessManager):
def __init__(self, parent=None):
QNetworkAccessManager.__init__(self, parent)
self.finished.connect(self._finished)
self.reply_lst = []
def _finished(self, reply):
print '[%s bytes] %s' % (reply.content_length, reply.url().toString())
self.reply_lst.remove(reply)
def createRequest(self, operation, request, body=None):
_url = request.url()
reply = super(Manager, self).createRequest(operation, request, body)
reply.readyRead.connect(self.read_read)
self.reply_lst.append(reply)
return reply
def read_read(self):
self.sender().content_length = self.sender().size()
def app_quit():
app.quit()
if __name__ == "__main__":
app = QApplication(['-platform', 'minimal'])
browser = QWebView()
page = QWebPage()
manager = Manager()
page.setNetworkAccessManager(manager)
browser.setPage(page)
browser.load(QUrl('https://www.lijiejie.com/'))
browser.loadFinished.connect(app_quit)
sys.exit(app.exec_())
最后,拿到所有URL的content-length如下,可以把它写到爬虫的结果中:
[499 bytes] https://www.lijiejie.com/
[2082 bytes] https://s6.cnzz.com/stat.php?id=3804994&web_id=3804994&show=pic
[11414 bytes] https://www.lijiejie.com/wp-includes/js/wp-emoji-release.min.js?ver=4.7.1
[20172 bytes] https://www.lijiejie.com/wp-content/plugins/crayon-syntax-highlighter/css/min/crayon.min.css?ver=_2.7.2_beta
[2850 bytes] https://www.lijiejie.com/wp-content/plugins/crayon-syntax-highlighter/themes/github/github.css?ver=_2.7.2_beta
[86 bytes] https://www.lijiejie.com/wp-content/plugins/crayon-syntax-highlighter/fonts/courier-new.css?ver=_2.7.2_beta
[374 bytes] https://www.lijiejie.com/wp-content/plugins/wp-pagenavi/pagenavi-css.css?ver=2.70
[27597 bytes] https://www.lijiejie.com/wp-content/themes/retro-fitted/style.min.css?ver=0.4
[10056 bytes] https://www.lijiejie.com/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1
[3678 bytes] https://www.lijiejie.com/wp-content/plugins/dynamic-to-top/js/libs/jquery.easing.js?ver=1.3
[4214 bytes] https://www.lijiejie.com/wp-content/themes/retro-fitted/library/js/drop-downs.min.js?ver=20110920
[22337 bytes] https://www.lijiejie.com/wp-content/plugins/crayon-syntax-highlighter/js/min/crayon.min.js?ver=_2.7.2_beta
[1192 bytes] https://www.lijiejie.com/wp-content/plugins/dynamic-to-top/js/dynamic.to.top.min.js?ver=3.5
[1398 bytes] https://www.lijiejie.com/wp-includes/js/wp-embed.min.js?ver=4.7.1
[6108 bytes] https://www.lijiejie.com/wp-includes/js/jquery/jquery.js?ver=1.12.4
[765 bytes] https://c.cnzz.com/core.php?web_id=3804994&show=pic&t=z
[3197 bytes] https://www.lijiejie.com/wp-content/themes/retro-fitted/images/bg.jpg
[2236 bytes] https://www.lijiejie.com/wp-content/plugins/crayon-syntax-highlighter/css/images/toolbar/buttons.png
[576 bytes] https://www.lijiejie.com/wp-content/themes/retro-fitted/images/quotes.png
[9766 bytes] https://www.lijiejie.com/wp-content/themes/retro-fitted/images/header.png
[7210 bytes] https://www.lijiejie.com/wp-content/uploads/2014/10/stom.tencent.com_.png
[719 bytes] https://icon.cnzz.com/img/pic.gif
[43 bytes] https://hzs9.cnzz.com/stat.htm?id=3804994&r=&lg=zh-cn&ntime=none&cnzz_eid=1834513944-1484813777-&showp=1920x1080&t=李劼杰的博客&h=1&rnd=1792993998
[43 bytes] https://cnzz.mmstat.com/9.gif?abc=1&rnd=2124379566
[142 bytes] https://www.lijiejie.com/wp-content/themes/retro-fitted/images/bullet.png
[2516 bytes] https://www.lijiejie.com/wp-content/uploads/2017/01/eping.png
[3282 bytes] https://www.lijiejie.com/wp-content/uploads/2014/10/stom.tencent.com_upfile.png