1kkk

网友投稿 913 2022-10-02

1kkk

1kkk

给基友下载漫画看（原文此处有一词被站点过滤，据上下文补回）

代码:

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Batch-download comic images from 1kkk.com / its cdndm5.com CDN.

NOTE(review): this listing was scraped from a web article and every URL
string literal was stripped out by the scraper.  The *_URL / REFERER
constants below are placeholders that MUST be filled in with the real
endpoints before the script can run.
"""

import os
import random
import re
import time

import requests
from lxml import etree

# TODO(review): URLs were removed from the scraped listing -- restore them.
LIST_URL = ''        # comic index page on 1kkk.com
POST_URL = ''        # endpoint POSTed to in geturl() to obtain session cookies
JS_URL_PREFIX = ''   # prefix of the ".../imagefun.ashx" URL
IMG_URL_PREFIX = ''  # scheme+prefix of the image host ("http://manhua...")
REFERER = ''         # Referer header value used by all requests


def geturl(url, postdata):
    """Fetch an image URL using cookies obtained from a priming POST.

    ``postdata`` is the form payload the site expects (cid/mid/page/...).
    Returns the ``requests.Response`` holding the image bytes.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) '
                      'AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 '
                      'Mobile/8J2 Safari/6533.18.5',
        'Referer': REFERER,
        'Host': 'manhua1023.61-174-50-131.cdndm5.com',
        'Accept': 'image/png,image/*;q=0.8,*/*;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
    }
    s = requests.Session()
    # The POST primes the session; its cookies authorize the image GET.
    r = s.post(POST_URL, data=postdata)
    rs = s.get(url, headers=header, cookies=r.cookies)
    return rs


def get(url):
    """GET a page pretending to be a desktop browser on 1kkk.com."""
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) '
                      'Gecko/20100101 Firefox/46.0',
        'Referer': REFERER,
        'Host': '1kkk.com',
    }
    return requests.get(url, headers=header)


def mget(url):
    """GET a page pretending to be a browser on the mobile site m.1kkk.com."""
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) '
                      'Gecko/20100101 Firefox/46.0',
        'Referer': REFERER,
        'Host': 'm.1kkk.com',
    }
    return requests.get(url, headers=header)


def validateTitle(title):
    """Strip characters illegal in Windows file names: / \\ : * ? " < > |"""
    rstr = r"[\/\\\:\*\?\"\<\>\|]"
    return re.sub(rstr, "", title)


def prints(timesleep):
    """Announce, then sleep ``timesleep`` seconds."""
    print('暂停' + str(timesleep) + '秒后开始批量-图片,请保持网络畅通...')
    time.sleep(timesleep)


def regnext(js):
    """Extract the packed ``var ... .split`` fragment(s) from an obfuscated JS blob."""
    reg = r'(var.+?.split)'
    return re.findall(re.compile(reg), js)


def createjia(path):
    """Create ``path`` (and parents); tolerate it already existing."""
    try:
        os.makedirs(path)
    except OSError:  # was a bare except: -- narrow to the actual failure mode
        print('目录已经存在:' + path)


if __name__ == '__main__':
    html = get(LIST_URL).content.decode('utf-8', 'ignore')
    page = etree.HTML(html.lower())

    # Chapter link suffixes, chapter names and page counts from the index page.
    hrefs = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/a/@href')
    hrefnames = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/a/text()')
    hrefpages = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/text()')

    href = list(hrefs)
    hrefname = list(hrefnames)
    hrefpage = [p.replace("页", "") for p in hrefpages]

    filenamep = '../data/' + str(hrefname[0]) + "/"
    createjia(filenamep)

    for i in range(len(href)):
        for j in range(len(hrefpage)):
            # Chapter id (e.g. 6871, 6872) embedded in the href suffix.
            hrefnumber = (str(href[i]).replace("ch54-", "")
                          .replace("/", "")
                          .replace("vol1-", ""))

            # The .ashx endpoint answers with obfuscated JS naming the image host.
            jsurl = (JS_URL_PREFIX + str(href[i]) + "/imagefun.ashx?cid="
                     + str(hrefnumber) + "&page=" + str(j + 1)
                     + "&key=65abd421f4aed565&maxcount=10")
            print(jsurl)

            html = get(jsurl).content.decode('utf-8', 'ignore')
            # The first "var ... .split" fragment is a |-separated token list.
            parts = regnext(html)[0].replace("'.split", "").split('|')

            # Rebuild the real image URL from the token list.
            image_1url = (IMG_URL_PREFIX + str(parts[19]) + "-" + str(parts[18])
                          + "-" + str(parts[9]) + "-" + str(parts[10])
                          + ".cdndm5.com/1/589/" + str(href[i]) + "/"
                          + str(parts[20]) + "?cid=" + str(6871)
                          + "&key=" + str(parts[8]) + "&type=1")
            print(image_1url)

            postdata = {
                'cid': 6871,
                'language': 1,
                'mid': 589,
                'page': j + 1,
                'tp': 8,
                'uid': 0,
            }

            pic = geturl(image_1url, postdata)
            # with-block guarantees the handle closes even if write() raises
            # (original opened the file before the network call and leaked it
            # on any exception).
            with open(filenamep + str(j + 1) + '.jpg', 'wb') as filess:
                filess.write(pic.content)
            print('已经写入第' + str(j + 1) + '张图片')

            # Pause 1-3 s between downloads to be gentle on the server.
            loadimg = random.randint(1, 3)
            print('暂停' + str(loadimg) + '秒')
            time.sleep(loadimg)

selenium抓取:

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:微信小程序如何实现滚动加载数据?(微信小程序如何实现滚动加载数据截图)
下一篇:Linux下升级python
相关文章

 发表评论

暂时没有评论,来抢沙发吧~