Preface
I have been learning Python web scraping recently. As a beginner, the idea of building a crawler of my own with Python had been on my mind for a while, so I decided to just do it: today I tried using Python to scrape some novel data from Qidian (起点中文网) and see what comes out.
The Python crawler code is as follows:
# -*- coding: utf-8 -*-
import time
import datetime
import csv

import requests
import threadpool
from bs4 import BeautifulSoup
from urllib.parse import urlencode


def get_unix_time():
    # current unix timestamp, used as the crawl time of each record
    dtime = datetime.datetime.now()
    ans_time = int(time.mktime(dtime.timetuple()))
    return ans_time


def init():
    # create qidian.csv and write the header row
    row = ['book_name', 'author', 'words_count', 'click_count', 'books_count',
           'score', 'j_user_count', 'crawl_time', 'id']
    # columns: novel title, author, word count, clicks, number of works,
    # score, number of raters, crawl time, url id
    with open("qidian.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(row)


def work(url, count=0):
    # scrape one book detail page and append a record to qidian.csv
    page = requests.get(url)
    page.encoding = "utf-8"
    soup = BeautifulSoup(page.text, 'lxml')
    try:
        # pick the fields out of the book info block
        elem = soup.select(".book-info h1 em")
        book_name = elem[0].text
        author = soup.select(".writer")[0].text
        words_count = soup.select(".book-info p em")[0].text
        click_count = soup.select(".book-info p em")[1].text
        books_count = soup.select(".work-state li em")[0].text
        book_id = url.replace("https://book.qidian.com/info/", "")
        crawl_time = get_unix_time()
        print(url)

        # the score and rater count are loaded via an ajax endpoint that returns JSON
        data = {
            '_csrfToken': 'QpbsVhyc5zc0h21NiEweIrLMu2tFOM1RsgfZtWSS',
            'bookId': book_id,
            'pageSize': 15
        }
        other_url = 'https://book.qidian.com/ajax/comment/index?' + urlencode(data)
        page = requests.get(other_url)
        page.encoding = "utf-8"
        cont = page.json()
        score = cont.get('data').get('rate')
        j_user_count = cont.get('data').get('userCount')

        # append the record to the CSV and remember the finished url
        row = [book_name, author, words_count, click_count, books_count,
               score, j_user_count, crawl_time, book_id]
        with open("qidian.csv", "a", encoding="utf-8", newline='') as f:
            writer = csv.writer(f, dialect="excel")
            writer.writerow(row)
        with open("doneurl.txt", "a", encoding='utf-8') as fe:
            fe.write(url + '\n')
    except BaseException:
        if count < 5:
            print('error: failed to extract elements, retry #' + str(count))
            time.sleep(2)
            work(url, count + 1)
        else:
            with open("error_url.txt", "a", encoding='utf-8') as fe:
                fe.write(url + '\n')
            print('error: failed to extract elements, url written to error_url.txt')


def load(i, count=0):
    # collect the 20 book URLs on listing page i and append them to urls.txt
    try:
        url = "https://www.qidian.com/all?page=" + str(i)
        print("Crawling listing page: {}".format(url))
        page = requests.get(url)
        page.encoding = "utf-8"
        soup = BeautifulSoup(page.text, 'lxml')
        elem = soup.select(".book-mid-info h4 a")  # links to the book detail pages
        urls = []
        for j in range(0, 20):
            urls.append('https:' + elem[j].get('href'))
        if len(urls) != 20:
            raise Exception('incomplete listing page: ' + str(i))
        with open('urls.txt', 'a', encoding='utf-8') as f:
            for cont in urls:
                f.write(str(cont) + '\n')
    except BaseException as e:
        if count < 5:
            load(i, count + 1)
        else:
            print(str(e))
            with open('urllist.txt', 'a', encoding='utf-8') as fp:
                fp.write(url + ' ' + str(i) + '\n')


def loadurl(start, end, thrednum):
    # crawl the listing pages from start to end with a thread pool
    links = list(range(start, end + 1))
    print(len(links))
    try:
        pool = threadpool.ThreadPool(thrednum)
        reqs = threadpool.makeRequests(load, links)
        [pool.putRequest(req) for req in reqs]
        pool.wait()
    except KeyboardInterrupt:
        print('interrupted manually')


def spider(start=1, end=2500, thrednum=10):
    # step 1: collect every book url into urls.txt
    loadurl(start, end, thrednum)

    # step 2: read the urls back into a list
    links = []
    with open('urls.txt', 'r', encoding='utf-8') as f:
        url = f.readline().strip('\n')
        while url:
            links.append(url)
            url = f.readline().strip('\n')
    # links = links[0:2500]

    # step 3: scrape every book page with a thread pool
    init()
    try:
        pool = threadpool.ThreadPool(thrednum)
        reqs = threadpool.makeRequests(work, links)
        [pool.putRequest(req) for req in reqs]
        pool.wait()
    except KeyboardInterrupt:
        print('interrupted manually')


if __name__ == '__main__':
    spider(1, 2500, 50)
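Before launching the full spider(1, 2500, 50) run, it can help to smoke-test the individual functions on a single page. The snippet below is only a minimal sketch: it assumes the code above has been saved as qidian_spider.py (a file name of my own choosing) in the working directory, and it relies on the __main__ guard at the bottom so the import does not kick off the full crawl:

# smoke_test.py -- quick sanity check before the full crawl
# assumes the crawler above was saved as qidian_spider.py (hypothetical file name)
from qidian_spider import init, load, work

init()      # create qidian.csv with the header row
load(1)     # write the 20 book URLs from listing page 1 into urls.txt

with open('urls.txt', encoding='utf-8') as f:
    first_url = f.readline().strip()

work(first_url)   # scrape one book page and append a row to qidian.csv
print('done, check qidian.csv and doneurl.txt')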
Summary:
Writing a crawler in Python is simple in some ways and a bit involved in others. The key is to work out how the URLs of the target pages are constructed; only then can you simulate the requests and actually fetch the pages. The crawler above is just a learning exercise and is still rough in many places; I plan to keep improving it.
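To make the "URL pattern" point concrete: this crawler really only depends on two patterns, both of which already appear in the code above, the paginated listing URL and the ajax comment endpoint whose JSON payload carries the score. A minimal sketch (the helper function names are my own, not part of the original script):

from urllib.parse import urlencode

def list_page_url(page_no):
    # listing page that shows 20 books at a time
    return "https://www.qidian.com/all?page=" + str(page_no)

def comment_api_url(book_id, csrf_token, page_size=15):
    # ajax endpoint whose JSON carries the score ('rate') and rater count ('userCount')
    query = urlencode({'_csrfToken': csrf_token, 'bookId': book_id, 'pageSize': page_size})
    return "https://book.qidian.com/ajax/comment/index?" + query

print(list_page_url(1))
print(comment_api_url('<book_id>', '<csrf_token>'))  # placeholders, fill in real values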