豆瓣读书和音乐搜索解密 window.__DATA__

python · 2023-07-22
豆瓣读书和音乐搜索解密 window.__DATA__
import json
import re
from urllib.parse import quote

import feapder
from nodejs.bindings import node_run



class DoubanBookSpider(feapder.AirSpider):
    """Crawl Douban book search results and print each book's URL and title.

    Flow: read search keywords from ``KEYWORD_FILE``, request the search page
    for each keyword, decrypt the ``window.__DATA__`` payload embedded in that
    page with the Node.js script at ``DECRYPT_SCRIPT``, then follow every
    result URL to its detail page.
    """

    # Text file holding one search keyword per line.
    KEYWORD_FILE = r'/Users/yangxin/Desktop/douban_book/keyword.txt'
    # Node.js script that turns the encrypted payload into JSON.
    DECRYPT_SCRIPT = "/Users/yangxin/Desktop/douban_book/spiders/main.js"
    # Search endpoint; ``{}`` receives the URL-encoded keyword.
    SEARCH_URL = 'https://search.douban.com/book/subject_search?search_text={}&cat=1001'
    # Captures the encrypted string assigned to window.__DATA__ in the page.
    _DATA_PATTERN = re.compile(r'window\.__DATA__ = "([^"]+)"')

    def start_requests(self):
        """Yield one search request per non-blank keyword in ``KEYWORD_FILE``."""
        with open(self.KEYWORD_FILE, 'r', encoding='utf-8') as f:
            for line in f:
                # Drop the trailing newline and skip blank lines so they never
                # end up inside the query string.
                key = line.strip()
                if not key:
                    continue
                print("本次抓取" + key)
                # Yield inside the loop: every keyword gets its own request,
                # not just the last one read from the file.
                yield feapder.Request(self.SEARCH_URL.format(quote(key, safe='')))

    def parse(self, request, response):
        """Decrypt the search page payload and yield a request per result.

        Raises:
            ValueError: if the page carries no ``window.__DATA__`` payload or
                the decrypt script returns no output.
        """
        match = self._DATA_PATTERN.search(response.text)
        if match is None:
            # Fail with a clear message instead of AttributeError on None.
            raise ValueError(f"window.__DATA__ not found in {request.url}")

        encrypted = match.group(1)
        stderr, stdout = node_run(self.DECRYPT_SCRIPT, encrypted)
        if not stdout:
            # Surface the script's error output instead of an opaque JSON error.
            raise ValueError(f"decrypt script produced no output: {stderr}")

        # NOTE(review): assumes the script prints a JSON list of objects that
        # each carry a "url" key -- confirm against main.js.
        for item in json.loads(stdout):
            yield feapder.Request(item['url'], callback=self.parse_detail)

    def parse_detail(self, request, response):
        """Print the detail page URL and the book title found in its <h1>."""
        url = request.url
        # The <h1><span> under #wrapper holds the book title.
        content = response.xpath('//*[@id="wrapper"]/h1/span/text()').extract_first()

        print("url", url)
        print("content", content)

if __name__ == "__main__":
    # Script entry point: build the spider and start crawling.
    spider = DoubanBookSpider()
    spider.start()

来源:https://github.com/SergioJune/Spider-Crack-JS

豆瓣爬虫
Theme Jasmine by Kent Liao