Python Scraping Notes: Xici Proxies, Douban Photos and Movies, and Chemical Supply Listings
2022-11-15
Category: Python scraping / other
Table of contents
- Scraping the Xici IP proxy site
- Douban celebrity photos
- Douban movies: directors, ratings, and more
- Chemical supply listings
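All four scripts below share the same fetching pattern: pick a random User-Agent and proxy for each request, then detect the page encoding with chardet before decoding. A minimal sketch of that shared idea (the URL, User-Agent list, and proxy pool are caller-supplied placeholders, not values from the original scripts):

import random
import chardet
import requests

def fetch(url, user_agents, proxy_pool, timeout=10):
    # Rotate User-Agent and proxy on every request, then decode with the
    # encoding chardet detects from the raw bytes.
    r = requests.get(url,
                     headers={"User-Agent": random.choice(user_agents)},
                     proxies=random.choice(proxy_pool),
                     timeout=timeout)
    encoding = chardet.detect(r.content)["encoding"] or "utf-8"
    return r.content.decode(encoding, errors="replace")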
Scraping the Xici IP proxy site
import re
import random
import chardet
import requests
import xlwt
from bs4 import BeautifulSoup


def Get_Html(Page_num):
    # Browser User-Agent pool to rotate through
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    ]
    # Proxy pool; requests expects lowercase scheme keys ("http"/"https")
    proxies = [{"http": "117.63.78.64:6666"},
               {"https": "114.225.169.215:53128"},
               {"https": "222.185.22.108:6666"}]
    # The URL was truncated in the original post; the Xici paginated listing
    # address is assumed here.
    url = "http://www.xicidaili.com/nn/{}".format(Page_num)
    r = requests.get(url,
                     headers={"User-Agent": random.choice(USER_AGENTS)},
                     proxies=random.choice(proxies))
    # Sniff the page encoding before decoding
    code = chardet.detect(r.content)['encoding']
    return r.content.decode(code)


data = []


def Get_data(Page_num):
    for index in range(Page_num):
        html = Get_Html(index + 1)
        print('正在获取第{}页数据'.format(index + 1))
        soup = BeautifulSoup(html, "html.parser")
        parents = soup.find("table", attrs={'id': "ip_list"})
        TRS = parents.find_all("tr")
        TRS.pop(0)  # drop the header row
        reg = re.compile(r'(\d+)天$')  # e.g. "352天" -> "352"
        for td in TRS:
            tds = td.find_all('td')
            IP_adress = tds[1].string
            duankou_IP = tds[2].get_text()           # port
            town_adress = tds[4].get_text()          # region
            leixing_IP = tds[5].get_text()           # HTTP / HTTPS
            alive = reg.findall(tds[8].get_text())   # survival time in days
            alive_time = alive[0] if alive else ''
            fabu_time = tds[9].get_text()            # publish time
            data.append([IP_adress + ":" + duankou_IP, town_adress,
                         leixing_IP, alive_time, fabu_time])
    print(data)
    return data


def Save_Excel():
    wbk = xlwt.Workbook(encoding="utf-8")
    sheet1 = wbk.add_sheet("西刺网站IP地址")
    fields = ["IP地址:端口号", "地区", "HTTPS/HTTP", "存活时间(天)", "发布时间"]
    for i in range(len(fields)):
        sheet1.write(0, i, fields[i])
    for j in range(len(data)):
        for k in range(len(fields)):
            sheet1.write(j + 1, k, data[j][k])
    wbk.save("西刺网站IP地址.xls")


def main(Page_num):
    Get_data(Page_num)
    Save_Excel()


main(5)
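Once the list is saved, the proxies can be fed straight back into requests. A minimal sketch of that round trip, assuming xlrd is installed and the .xls file written by Save_Excel() above sits in the working directory:

import xlrd

def load_proxies(path="西刺网站IP地址.xls"):
    # Turn each saved row into a requests-style proxies dict,
    # e.g. {"http": "1.2.3.4:8080"}.
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_index(0)
    pool = []
    for row in range(1, sheet.nrows):  # skip the header row
        addr = sheet.cell_value(row, 0)                      # "IP:port"
        scheme = sheet.cell_value(row, 2).strip().lower()    # "HTTP" / "HTTPS"
        pool.append({scheme or "http": addr})
    return pool

# proxy_pool = load_proxies()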
Douban celebrity photos
# encoding: utf-8
# Download every photo on a Douban celebrity gallery page to a local folder
# with urllib's urlretrieve().
import os
import requests
from bs4 import BeautifulSoup
from urllib import request

print('------------豆瓣美女--------------')

address = str(1049732)  # subject id used in the assumed photo-page URL below


# Fetch one page of the photo gallery and return the parsed soup
def getHtml(index, number):
    # The URL was truncated in the original post; the Douban celebrity
    # photo-page address is assumed here.
    url = "https://movie.douban.com/celebrity/" + number + "/photos/?type=C&start=" + str(index)
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"})
    soup = BeautifulSoup(r.text, "html.parser")
    return soup


images = []
# Number of images on the first page, used as the paging step
imgLen = len(getHtml(0, address).find('ul', attrs={'class': "poster-col3 clearfix"}).find_all('img'))


def getImages(pageNum, name, number):
    # Create the target folder if it does not exist yet, then work inside it
    if not os.path.exists(name):
        os.mkdir(name)
    os.chdir(name)
    global address, images, imgLen
    for k in range(pageNum):
        eachsoup = getHtml(k * imgLen, number)
        imageList = eachsoup.find('ul', attrs={'class': "poster-col3 clearfix"})
        # extend() keeps a single flat list; append() would nest one list per
        # page and the download loop below would need extra handling
        images.extend(imageList.find_all('img'))
    # Download every collected image
    for i in range(len(images)):
        try:
            suffix = images[i]['src'][-3:]
            image_name = str(i + 1) + '.' + suffix
            request.urlretrieve(images[i]['src'], image_name)
        except Exception:
            print('存储有异常')
    return


getImages(1, 'liyifei1', address)
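If urlretrieve proves unreliable, the same download can be done with requests alone, streaming the bytes to disk. A small sketch; save_image is a hypothetical helper, not part of the original script:

import requests

def save_image(src, filename, timeout=10):
    # Stream the image to disk in chunks instead of loading it all at once.
    resp = requests.get(src, stream=True, timeout=timeout)
    resp.raise_for_status()
    with open(filename, "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)

# save_image(images[0]['src'], '1.jpg')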
Douban movies: directors, ratings, and more
# encoding: utf-8
import re
import random
import chardet
import requests
import xlwt
from bs4 import BeautifulSoup


# Fetch one page (25 movies) of the Douban Top 250 list
def getHtml(page_num):
    # Browser User-Agent pool
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    ]
    # Proxy IPs
    proxies = ["110.72.36.241:8123", "121.31.100.224:8123",
               "220.171.89.66:808", "61.135.217.7:80", "121.31.192.227:8123"]
    # The URL prefix was missing in the original post; the standard Douban
    # Top 250 address is assumed here.
    url = "https://movie.douban.com/top250?start=" + str(page_num * 25) + "&filter="
    r = requests.get(url,
                     headers={"User-Agent": random.choice(USER_AGENTS)},
                     proxies={"http": random.choice(proxies)})
    code = chardet.detect(r.content)["encoding"]
    return r.content.decode(code)


dataList = []


def get_Data(num):
    for i in range(num):
        html = getHtml(i)
        soup = BeautifulSoup(html, "html.parser")
        # print(soup)  # uncomment to dump the raw page while debugging
        DIVparent = soup.find("div", attrs={"id": "content"})  # parent container
        lis = DIVparent.find_all("li")
        reg = re.compile(r'.*(\d{4}).*')
        for li in lis:
            data = []
            titles = li.find("span", attrs={"class": "title"}).string
            data.append(titles)  # movie title
            years_str = li.find('div', attrs={'class': 'bd'}).find('p').get_text()
            years = re.findall(reg, years_str)[0]
            data.append(years)  # release year
            actor = li.find("p").get_text().strip().split('\t')
            data.append(actor[0])  # director and cast
            count_num = li.find("span", attrs={"class": "rating_num", 'property': "v:average"}).string
            data.append(count_num)  # rating
            person_num = li.find('div', attrs={'class': 'star'}).find_all('span')[3].get_text()
            number = re.findall(re.compile(r'\d*'), person_num)[0]
            data.append(number)  # number of ratings
            if li.find('div', attrs={'class': 'bd'}).find('p', attrs={'class': 'quote'}):
                pinglun = li.find('div', attrs={'class': 'bd'}).find('p', attrs={'class': 'quote'}).find('span').get_text()
            else:
                pinglun = ''
            data.append(pinglun)  # one-line quote
            dataList.append(data)
    return dataList


def save_Excel():
    book = xlwt.Workbook(encoding="utf-8")  # create the workbook
    sheet = book.add_sheet("douban_top250")
    col = ('电影名', '发行时间', '导演与主演', '电影评分', '评价人数', '简评')
    for i in range(len(col)):
        sheet.write(0, i, col[i])
    for i in range(len(dataList)):         # rows
        for j in range(len(dataList[i])):  # columns
            sheet.write(i + 1, j, dataList[i][j])
    book.save('douban_top250.xls')


def main(num):
    get_Data(num)
    save_Excel()


main(2)
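Because get_Data() returns plain lists, a quick sanity check can be run before (or instead of) exporting to Excel. A short sketch assuming the row layout built above (title, year, credits, rating, votes, quote); top_rated is a hypothetical helper:

def top_rated(rows, n=10):
    # Sort by the rating column (index 3); ratings are strings like "9.7".
    ranked = sorted(rows, key=lambda row: float(row[3] or 0), reverse=True)
    for title, year, _, score, votes, _ in ranked[:n]:
        print("{}  {}  rating {}  {} votes".format(year, title, score, votes))

# top_rated(get_Data(2))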
Chemical supply listings
import time
import random
import chardet
import requests
import xlwt
from bs4 import BeautifulSoup


def Get_Html(Page_num):
    # The listing URL was missing from the original post; substitute the
    # chemical-supply site's paginated address here (built from Page_num).
    url = ""  # placeholder -- original URL not recoverable
    # Browser User-Agent pool
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    ]
    # Proxy pool; requests expects lowercase scheme keys ("http"/"https")
    proxies = [{"http": "117.63.78.64:6666"},
               {"https": "114.225.169.215:53128"},
               {"https": "222.185.22.108:6666"}]
    r = requests.get(url,
                     headers={'User-Agent': random.choice(USER_AGENTS)},
                     proxies=random.choice(proxies))
    code = chardet.detect(r.content)['encoding']
    return r.content.decode(code)


data = []


def Get_Data(Page_num):
    for i in range(Page_num):
        html = Get_Html(i)
        print('正在获取第{}页...'.format(i + 1))
        time.sleep(3)  # pause between pages to stay polite
        print('正在休息...')
        soup = BeautifulSoup(html, "html.parser")
        parents = soup.find('div', attrs={'class': 'main_960px'})
        TRs = parents.find_all('tr', attrs={'style': 'background-color:#EEEEEE;'})
        for td in TRs:
            tds = td.find_all('td')
            chemical_information = []
            for j in tds:
                # Plain text cell, linked cell, or empty cell
                if j.string is not None:
                    chemical_info = j.get_text().strip()
                elif j.find('a') is not None:
                    chemical_info = j.find('a').get_text().strip()
                else:
                    chemical_info = ''
                chemical_information.append(chemical_info)
            data.append(chemical_information)
    print('==========================', data)
    return data


def Save_Excel():
    wbk = xlwt.Workbook(encoding="utf-8")
    sheet1 = wbk.add_sheet("化学品供应信息")
    fields = ["发布日期", "产品名称", "产品英文名称", "CAS号", "包装与价格信息", '公司名称']
    for i in range(len(fields)):
        sheet1.write(0, i, fields[i])
    for j in range(len(data)):
        for k in range(min(len(fields), len(data[j]))):
            sheet1.write(j + 1, k, data[j][k])
    wbk.save("化学品供应信息.xls")


def main(Page_num):
    Get_Data(Page_num)
    Save_Excel()


main(10)
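Free proxies fail often, so a run over ten pages (main(10) above) benefits from a small retry wrapper around Get_Html. A sketch under that assumption; get_html_with_retry is not part of the original script:

import time
import requests

def get_html_with_retry(page_num, retries=3, wait=3):
    # Try Get_Html a few times, pausing between attempts; re-raise after the
    # last failure so the caller still sees the error.
    for attempt in range(1, retries + 1):
        try:
            return Get_Html(page_num)
        except requests.RequestException as exc:
            print("request attempt {} failed: {}".format(attempt, exc))
            if attempt == retries:
                raise
            time.sleep(wait)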