2022-11-12
Python | Scraping the 一人之下 (Yi Ren Zhi Xia) comic and saving it as PDF files
I've been watching season 4 of Yi Ren Zhi Xia (the "Chen Duo" arc) on Tencent Video, but the season is only 12 episodes and has already finished. Still curious about Chen Duo, I couldn't bear waiting for the next season, so... O(∩_∩)O haha~
The protagonist of this article:
36漫画网
This site's anti-scraping measures are only so-so (commendable!), so I won't walk through every step; it's a very simple crawl. The images are merely lazy-loaded, and their real URLs are easy to find.
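Before the full script, a minimal sketch of that extraction step, assuming a chapter page served in this site's MIP markup (the URL below is a placeholder, not the real one):

import requests
from pyquery import PyQuery

headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get('https://example.com/some-chapter.html', headers=headers)  # placeholder URL
html.encoding = 'utf-8'
# Even though the images lazy-load in the browser, the real addresses
# already sit in the src attribute of the <mip-img> tags
pics = [img.attr('src') for img in PyQuery(html.text)('div > mip-link mip-img').items()]
print(pics)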
Now straight to the full code:
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Get the URL of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    # Folder named after the comic title, taken from the page <title>
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    # Keep only list entries whose text starts with a chapter number
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items()
                if a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()  # the list on the page runs newest-first
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Get the image links of every chapter."""
    path, chapters = path_chapters
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        folder_path = '\\'.join([path, name])
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
        img_download(folder_path, pic_linking)


def img_download(path, pics):
    """Download the images."""
    num = 1
    print(f"Start downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for pic in pics:
        print(num, end=' ')
        try:
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            print("Error! Waiting 5 s before retrying...")
            time.sleep(5)
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        num += 1
    jpg_to_pdf(path)


def jpg_to_pdf(path):
    """Build the PDF file."""
    print(f"--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)  # one image per page, page sized to the first image
        ca.showPage()
    ca.save()


def main():
    _url = ''  # the comic's index page URL (elided in the original post)
    _chapter = get_chapter(_url)
    get_pic_linking(_chapter)


if __name__ == '__main__':
    main()
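Running this downloads every chapter into its own subfolder under a folder named after the comic, then turns each chapter folder into one PDF whose page size matches that chapter's first image.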
While the code runs, it may raise requests.exceptions.SSLError: HTTPSConnectionPool(host='XXX', port=443), the usual certificate-verification error for Python crawlers; there are plenty of write-ups on fixing it.
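For reference, the workaround boils down to the two things the script above already does for the image requests: skip certificate verification and silence the warning that produces.

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress the InsecureRequestWarning that verify=False triggers
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# verify=False skips TLS certificate verification (fine for this use, but
# be aware it removes protection against man-in-the-middle attacks)
data = requests.get('https://example.com/1.jpg', verify=False).content  # placeholder URL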
To work around that error, and because there's rarely a need to download every chapter at once, I reorganized the code. Usage: enter 1 to download chapters 1-10, enter 2 for chapters 11-20, and so on... Every 10 chapters become one PDF, so you no longer have to download the whole series, haha.
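The mapping from the entered number to a slice of the chapter list is plain arithmetic; a worked example (illustration only, ignoring the missing-chapter shift handled later):

sec = 2                                   # user input '2'
start, stop = (sec - 1) * 10, sec * 10    # list slice [10:20]
name = f'{start + 1}-{stop}'              # PDF name: '11-20'
print(start, stop, name)                  # -> 10 20 11-20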
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Get the link of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items()
                if a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Get the image links."""
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        pics_linking += pic_linking
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception:
        print("Something went wrong, please try again o(╥﹏╥)o")
        shutil.rmtree(folder_path)  # drop the half-finished folder so a retry starts clean


def img_download(path, pics):
    """Download the images."""
    num = 1
    row = list(range(1, 30))
    print(f"Start downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for pic in pics:
        print(num, end=' ')
        if num // 30 in row:  # start a new line after every 30 numbers
            print()
            row.pop(0)
        try:
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            print("Error! Please wait 5 s!")
            time.sleep(5)
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        num += 1
    jpg_to_pdf(path)
    shutil.rmtree(path)  # remove the JPG folder once the PDF is built


def jpg_to_pdf(path):
    """Build the PDF file."""
    print(f"\n--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()


def select_section(section, chapters):
    """Select the range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    # The site is missing chapters 425-428, so the list is 4 entries short:
    # the real last chapter number is len(chapters[1]) + 4
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, and you want to download {(sec - 1) * 10 + 1}-{sec * 10}? Seriously?!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, so you can only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\{name}章", chapter


def main():
    _url = ''  # the comic's index page URL (elided in the original post)
    print("Enter 1 to download chapters 1-10, 2 for chapters 11-20, and so on...")
    _section = input("Enter a number: ")
    _chapter = get_chapter(_url)
    _chapters = select_section(_section, _chapter)
    get_pic_linking(_chapters)


if __name__ == '__main__':
    main()
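Compared with the first version, this one collects the image links of all selected chapters into a single list, downloads them into one folder per 10-chapter batch, converts that folder into a single PDF, and then deletes the JPG folder; on failure the half-finished folder is removed so a retry starts clean.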
This site is missing chapters 425-428 (screenshot of the chapter list omitted).
So I use a function to handle that gap (if the site adds these chapters later, feel free to adjust the code yourself, or message me):
def select_section(section, chapters):
    """Select the range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    # The chapter list is 4 entries short of the real chapter count,
    # so len(chapters[1]) + 4 is the number of the last chapter
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, and you want to download {(sec - 1) * 10 + 1}-{sec * 10}? Seriously?!")
        exit()
    if sec < 43:
        # Sections before the gap: list indices line up with chapter numbers
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        # Section 43 (chapters 421-430) spans the gap and only has 6 entries
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        # Sections after the gap: every index is shifted back by 4
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        # Last, possibly partial, section
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, so you can only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    # '章' means 'chapters'; the folder/PDF is named e.g. '421-430章'
    return chapters[0] + f"\\{name}章", chapter
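A hypothetical usage, assuming the site currently lists enough chapter entries: asking for section 43 returns the 6 chapters around the gap (the URL is a placeholder).

folder, chaps = get_chapter('https://example.com/comic/')  # placeholder URL
path, selected = select_section('43', (folder, chaps))     # prints the missing-chapter notice
print(path)           # ...\421-430章
print(len(selected))  # 6 (chapters 421-424 and 429-430)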
Progress bar version
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Get the link of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items()
                if a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Get the image links."""
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        pics_linking += pic_linking
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception as e:
        print(e)
        print("Something went wrong, please try again o(╥﹏╥)o")
        shutil.rmtree(folder_path)


def img_download(path, pics):
    """Download the images."""
    print(f"Start downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for num, pic in enumerate(pics):
        # '\r' rewinds to the start of the line so the bar redraws in place
        print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / len(pics) * 100:.0f}%', end='')
        try:
            with open('\\'.join([path, str(num + 1) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            time.sleep(5)
            with open('\\'.join([path, str(num + 1) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
    jpg_to_pdf(path)
    shutil.rmtree(path)


def jpg_to_pdf(path):
    """Build the PDF file."""
    print(f"\n--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()


def select_section(section, chapters):
    """Select the range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, and you want to download {(sec - 1) * 10 + 1}-{sec * 10}? Seriously?!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, so you can only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\{name}章", chapter


def main():
    _url = ''  # the comic's index page URL (elided in the original post)
    print("Enter 1 to download chapters 1-10, 2 for chapters 11-20, and so on...")
    _section = input("Enter a number: ")
    _chapter = get_chapter(_url)
    _chapters = select_section(_section, _chapter)
    get_pic_linking(_chapters)


if __name__ == '__main__':
    main()
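The progress bar itself is just a carriage-return trick; a standalone sketch of the same idea:

import time

total = 60
for num in range(total):
    time.sleep(0.02)  # stand-in for downloading one image
    # '\r' rewinds to the start of the line; one '▇' block per two finished items
    print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / total * 100:.0f}%', end='')
print()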
I ran this in PyCharm; it doesn't seem to work in the bundled IDLE, most likely because IDLE's shell doesn't handle the carriage return ('\r') that redraws the progress bar.
Motto: these things really are simple. Don't complain that you can't learn them; it just means you haven't put in enough care.