2022-11-12
Python | Scraping the 一人之下 (Yi Ren Zhi Xia) comic and saving it as PDF files
I've been watching season 4 of Yi Ren Zhi Xia (the "Chen Duo" arc) on Tencent Video, but the season is only 12 episodes and has already finished. Still curious about Chen Duo, I couldn't bear waiting for the next season, so... O(∩_∩)O haha~
The protagonist of this article:
36漫画网
This site's anti-scraping measures are only so-so (commendable!), so I won't walk through every step; it's a very simple crawl. The images are merely lazy-loaded, and their real URLs are easy to find.
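Before the full script, a minimal sketch of that extraction step, assuming a chapter page served in this site's MIP markup (the URL below is a placeholder, not the real one):

import requests
from pyquery import PyQuery

headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get('https://example.com/some-chapter.html', headers=headers)  # placeholder URL
html.encoding = 'utf-8'
# Even though the images lazy-load in the browser, the real addresses
# already sit in the src attribute of the <mip-img> tags
pics = [img.attr('src') for img in PyQuery(html.text)('div > mip-link mip-img').items()]
print(pics)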
Now straight to the full code:
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Get the URL of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    # Folder named after the comic title, taken from the page <title>
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    # Keep only list entries whose text starts with a chapter number
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items()
                if a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()  # the list on the page runs newest-first
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Get the image links of every chapter."""
    path, chapters = path_chapters
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        folder_path = '\\'.join([path, name])
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
        img_download(folder_path, pic_linking)


def img_download(path, pics):
    """Download the images."""
    num = 1
    print(f"Start downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for pic in pics:
        print(num, end=' ')
        try:
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            print("Error! Waiting 5 s before retrying...")
            time.sleep(5)
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        num += 1
    jpg_to_pdf(path)


def jpg_to_pdf(path):
    """Build the PDF file."""
    print(f"--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)  # one image per page, page sized to the first image
        ca.showPage()
    ca.save()


def main():
    _url = ''  # the comic's index page URL (elided in the original post)
    _chapter = get_chapter(_url)
    get_pic_linking(_chapter)


if __name__ == '__main__':
    main()
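Running this downloads every chapter into its own subfolder under a folder named after the comic, then turns each chapter folder into one PDF whose page size matches that chapter's first image.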
While the code runs, it may raise requests.exceptions.SSLError: HTTPSConnectionPool(host='XXX', port=443), the usual certificate-verification error for Python crawlers; there are plenty of write-ups on fixing it.
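For reference, the workaround boils down to the two things the script above already does for the image requests: skip certificate verification and silence the warning that produces.

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress the InsecureRequestWarning that verify=False triggers
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# verify=False skips TLS certificate verification (fine for this use, but
# be aware it removes protection against man-in-the-middle attacks)
data = requests.get('https://example.com/1.jpg', verify=False).content  # placeholder URL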
To work around that error, and because there's rarely a need to download every chapter at once, I reorganized the code. Usage: enter 1 to download chapters 1-10, enter 2 for chapters 11-20, and so on... Every 10 chapters become one PDF, so you no longer have to download the whole series, haha.
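The mapping from the entered number to a slice of the chapter list is plain arithmetic; a worked example (illustration only, ignoring the missing-chapter shift handled later):

sec = 2                                   # user input '2'
start, stop = (sec - 1) * 10, sec * 10    # list slice [10:20]
name = f'{start + 1}-{stop}'              # PDF name: '11-20'
print(start, stop, name)                  # -> 10 20 11-20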
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Get the link of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items()
                if a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Get the image links."""
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        pics_linking += pic_linking
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception:
        print("Something went wrong, please try again o(╥﹏╥)o")
        shutil.rmtree(folder_path)  # drop the half-finished folder so a retry starts clean


def img_download(path, pics):
    """Download the images."""
    num = 1
    row = list(range(1, 30))
    print(f"Start downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for pic in pics:
        print(num, end=' ')
        if num // 30 in row:  # start a new line after every 30 numbers
            print()
            row.pop(0)
        try:
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            print("Error! Please wait 5 s!")
            time.sleep(5)
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        num += 1
    jpg_to_pdf(path)
    shutil.rmtree(path)  # remove the JPG folder once the PDF is built


def jpg_to_pdf(path):
    """Build the PDF file."""
    print(f"\n--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()


def select_section(section, chapters):
    """Select the range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    # The site is missing chapters 425-428, so the list is 4 entries short:
    # the real last chapter number is len(chapters[1]) + 4
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, and you want to download {(sec - 1) * 10 + 1}-{sec * 10}? Seriously?!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, so you can only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\{name}章", chapter


def main():
    _url = ''  # the comic's index page URL (elided in the original post)
    print("Enter 1 to download chapters 1-10, 2 for chapters 11-20, and so on...")
    _section = input("Enter a number: ")
    _chapter = get_chapter(_url)
    _chapters = select_section(_section, _chapter)
    get_pic_linking(_chapters)


if __name__ == '__main__':
    main()
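Compared with the first version, this one collects the image links of all selected chapters into a single list, downloads them into one folder per 10-chapter batch, converts that folder into a single PDF, and then deletes the JPG folder; on failure the half-finished folder is removed so a retry starts clean.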
This site is missing chapters 425-428 (screenshot of the chapter list omitted).
So I use a function to handle that gap (if the site adds these chapters later, feel free to adjust the code yourself, or message me):
def select_section(section, chapters):
    """Select the range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    # The chapter list is 4 entries short of the real chapter count,
    # so len(chapters[1]) + 4 is the number of the last chapter
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, and you want to download {(sec - 1) * 10 + 1}-{sec * 10}? Seriously?!")
        exit()
    if sec < 43:
        # Sections before the gap: list indices line up with chapter numbers
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        # Section 43 (chapters 421-430) spans the gap and only has 6 entries
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        # Sections after the gap: every index is shifted back by 4
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        # Last, possibly partial, section
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, so you can only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    # '章' means 'chapters'; the folder/PDF is named e.g. '421-430章'
    return chapters[0] + f"\\{name}章", chapter
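A hypothetical usage, assuming the site currently lists enough chapter entries: asking for section 43 returns the 6 chapters around the gap (the URL is a placeholder).

folder, chaps = get_chapter('https://example.com/comic/')  # placeholder URL
path, selected = select_section('43', (folder, chaps))     # prints the missing-chapter notice
print(path)           # ...\421-430章
print(len(selected))  # 6 (chapters 421-424 and 429-430)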
Progress bar version
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Get the link of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items()
                if a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Get the image links."""
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        pics_linking += pic_linking
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception as e:
        print(e)
        print("Something went wrong, please try again o(╥﹏╥)o")
        shutil.rmtree(folder_path)


def img_download(path, pics):
    """Download the images."""
    print(f"Start downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for num, pic in enumerate(pics):
        # '\r' rewinds to the start of the line so the bar redraws in place
        print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / len(pics) * 100:.0f}%', end='')
        try:
            with open('\\'.join([path, str(num + 1) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            time.sleep(5)
            with open('\\'.join([path, str(num + 1) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
    jpg_to_pdf(path)
    shutil.rmtree(path)


def jpg_to_pdf(path):
    """Build the PDF file."""
    print(f"\n--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()


def select_section(section, chapters):
    """Select the range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, and you want to download {(sec - 1) * 10 + 1}-{sec * 10}? Seriously?!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, so you can only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\{name}章", chapter


def main():
    _url = ''  # the comic's index page URL (elided in the original post)
    print("Enter 1 to download chapters 1-10, 2 for chapters 11-20, and so on...")
    _section = input("Enter a number: ")
    _chapter = get_chapter(_url)
    _chapters = select_section(_section, _chapter)
    get_pic_linking(_chapters)


if __name__ == '__main__':
    main()
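The progress bar itself is just a carriage-return trick; a standalone sketch of the same idea:

import time

total = 60
for num in range(total):
    time.sleep(0.02)  # stand-in for downloading one image
    # '\r' rewinds to the start of the line; one '▇' block per two finished items
    print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / total * 100:.0f}%', end='')
print()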
I ran this in PyCharm; it doesn't seem to work in the bundled IDLE, most likely because IDLE's shell doesn't handle the carriage return ('\r') that redraws the progress bar.
Motto: these things really are simple. Don't complain that you can't learn them; it just means you haven't put in enough care.