运维开发网

scrapy爬取杰书网小说

运维开发网 https://www.qedev.com 2020-10-25 08:39 出处:51CTO 作者:wx57d7dc52ee7f8
scrapy抓取杰书网小说

"""
Scrape a novel from 杰书网 (jieqishu).

Site URL: http://www.jieqishu.com

This script is for learning purposes only.
"""

import requests

from bs4 import BeautifulSoup

import time,random

book_name = 'jieqishu'  # novel slug: used both in the URL path and the output filename

book_url = 'http://www.jieqishu.com' + '/' + book_name + '/'  # chapter-index page for the novel

# Fetch the chapter index and decode using the encoding detected from the payload,
# since the site's declared charset is unreliable.
response = requests.get(url=book_url)
response.encoding = response.apparent_encoding

soup = BeautifulSoup(response.text, features='html.parser')

# The element with id="list" contains one <dd><a href="..."></dd> per chapter.
a = soup.find(id='list')
dd_all = a.find_all('dd')

# Build absolute chapter URLs from the relative hrefs in each <dd>.
http_all = [book_url + dd.find('a').attrs.get('href') for dd in dd_all]

# The first 8 entries are the "latest chapters" shortcut links duplicated at the
# top of the index, not the start of the book — skip them.
http_all = http_all[8:]

m = 5  # cap on chapters fetched per run (testing limit)

# NOTE: explicit UTF-8 so the Chinese text round-trips regardless of the
# platform's default locale encoding.
with open(book_name + '.txt', 'w', encoding='utf-8') as f:
    n = 0  # chapters written so far
    for chapter_url in http_all:
        if m == n:
            break
        h = requests.get(url=chapter_url)
        h.encoding = h.apparent_encoding
        hb = BeautifulSoup(h.text, features='html.parser')
        tar_t = hb.find(id='content')  # <div id="content"> holds the chapter body
        tar_h = hb.find("h1").text     # chapter title
        f.write(tar_h + '\n')
        # Write each child node of the body as text, skipping the <br/> tags
        # the site uses as line separators.
        for node in tar_t:
            if str(node) != "<br/>":
                f.write(str(node).lstrip() + '\n')
        # Random delay between chapter fetches to avoid an IP ban.
        time.sleep(random.randint(3, 6))
        n += 1
        f.write('\n\n')
        print('第%d章写入完成!' % n)
# No f.close() needed: the with-block closes the file on exit.

扫码领视频副本.gif

0

精彩评论

暂无评论...
验证码 换一张
取 消

关注公众号