import json
import re
from urllib.parse import quote

import feapder
from nodejs.bindings import node_run
class DoubanBookSpider(feapder.AirSpider):
    """Search Douban Books for every keyword in a file and print each hit's title.

    Flow: ``start_requests`` issues one search request per keyword, ``parse``
    decrypts the ``window.__DATA__`` payload embedded in the search page via an
    external Node.js script and follows every result URL, and ``parse_detail``
    prints the URL and book title of each detail page.

    The file locations are class attributes so a subclass (or an assignment
    before ``start()``) can override them without editing the methods.
    """

    # Text file holding one search keyword per line.
    KEYWORD_FILE = r'/Users/yangxin/Desktop/douban_book/keyword.txt'
    # Node.js script that is handed the encrypted payload as its argument; its
    # stdout is parsed as JSON below.
    DECRYPT_SCRIPT = "/Users/yangxin/Desktop/douban_book/spiders/main.js"
    # Search endpoint; ``{}`` is replaced by the URL-encoded keyword.
    SEARCH_URL = 'https://search.douban.com/book/subject_search?search_text={}&cat=1001'
    # Captures the encrypted blob assigned to ``window.__DATA__`` in the page.
    _DATA_PATTERN = re.compile(r'window\.__DATA__ = "([^"]+)"')

    def start_requests(self):
        """Yield one search request per non-empty keyword in ``KEYWORD_FILE``."""
        with open(self.KEYWORD_FILE, 'r', encoding='utf-8') as f:
            for line in f:
                # Lines keep their trailing newline; strip it so it does not
                # leak into the URL, and skip blank lines entirely.
                key = line.strip()
                if not key:
                    continue
                print("本次抓取" + key)
                # Percent-encode so characters such as '&', '#' or spaces in a
                # keyword cannot corrupt the query string.
                yield feapder.Request(self.SEARCH_URL.format(quote(key)))

    def parse(self, request, response):
        """Decrypt the search-result payload and follow every result URL.

        Pages without a ``window.__DATA__`` assignment, and decrypt output that
        is not valid JSON, are reported and skipped instead of raising.
        """
        match = self._DATA_PATTERN.search(response.text)
        if match is None:
            print("window.__DATA__ not found in", request.url)
            return

        encrypted = match.group(1)  # encrypted data
        stderr, stdout = node_run(self.DECRYPT_SCRIPT, encrypted)
        try:
            results = json.loads(stdout)
        except (TypeError, ValueError):
            print("could not decode search results for", request.url, "stderr:", stderr)
            return

        # NOTE(review): the script is presumed to emit a list of dicts that each
        # carry a 'url' key -- confirm against main.js. Entries without a URL
        # are skipped.
        for item in results:
            url_detail = item.get('url') if isinstance(item, dict) else None
            if not url_detail:
                continue
            yield feapder.Request(
                url_detail, callback=self.parse_detail
            )

    def parse_detail(self, request, response):
        """Parse a detail page: print its URL and the book title."""
        url = request.url
        # Book title; None when the XPath matches nothing.
        content = response.xpath('//*[@id="wrapper"]/h1/span/text()').extract_first()
        print("url", url)
        print("content", content)
if __name__ == "__main__":
    # Script entry point: build the spider and run it.
    spider = DoubanBookSpider()
    spider.start()
# Source: https://github.com/SergioJune/Spider-Crack-JS