本帖最后由 zyjsuper 于 2020-8-22 21:38 编辑
Selenium 的效率的确不敢恭维啊,不知道哪位大神能分享下并发的策略,不胜感激啊。
[Python] 纯文本查看 复制代码
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author : zyjsuper
@License : (C) Copyright 2013-2020
@Contact : [email protected]
@File : MztSpider.py
@Time : 2020/8/17 20:23
@Desc :
'''
from selenium import webdriver
import requests,os
def get_pic(page):
    """Download the main image of every gallery linked from one index page.

    Loads ``https://www.mzitu.com/page/<page>`` in headless Chrome, collects
    the ``href`` of every ``//ul[@id='pins']/li/a`` link, visits each link and
    saves the image found under ``div.main-image`` into a ``page-<page>``
    folder inside ``Desktop/Meizitu`` of the current user.

    Args:
        page: Index page number; it is converted with ``str()`` for both the
            URL and the output folder name.

    Raises:
        selenium.common.exceptions.WebDriverException: If Chrome cannot be
            started, a page cannot be loaded, or a gallery page has no
            matching image element.
        OSError: If the output folder or an image file cannot be written.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0',
        'Referer': 'https://www.mzitu.com/**/'
    }

    # USERNAME is normally only set on Windows; fall back to the home
    # directory instead of failing with a TypeError on str + None.
    username = os.getenv("USERNAME")
    if username:
        savepath = "C:\\Users\\" + username + "\\Desktop\\Meizitu"
    else:
        savepath = os.path.join(os.path.expanduser("~"), "Desktop", "Meizitu")

    # makedirs(exist_ok=True) creates both levels and tolerates re-runs,
    # replacing the bare ``except: pass`` and the unguarded second mkdir.
    page_dir = os.path.join(savepath, "page-" + str(page))
    os.makedirs(page_dir, exist_ok=True)

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    # One browser is reused for the index page and every gallery page;
    # starting a new Chrome per URL dominated the run time.
    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get("https://www.mzitu.com/page/" + str(page))

        # Read all hrefs before navigating away: the link elements become
        # stale as soon as the browser loads another page.
        # find_elements("xpath", ...) works on Selenium 3 and 4, whereas the
        # find_elements_by_xpath helper was removed in Selenium 4.3.
        links = browser.find_elements("xpath", "//ul[@id='pins']/li/a")
        hrefs = [link.get_attribute("href") for link in links]
        urls = [href for href in hrefs if href]

        for url in urls:
            browser.get(url)
            pic_url = browser.find_element(
                "xpath", "//div[@class='main-image']//p//a//img"
            ).get_attribute("src")
            name = str(pic_url).split('/')[-1]

            # A timeout keeps one stalled download from hanging the crawl;
            # a failed download is reported and skipped, not fatal.
            try:
                response = requests.get(pic_url, headers=headers, timeout=30)
            except requests.RequestException as exc:
                print("下载图片%s失败:%s" % (name, exc))
                continue
            if not response.ok:
                # Do not save an HTTP error page as if it were an image.
                print("下载图片%s失败,状态码%s。" % (name, response.status_code))
                continue

            print("获取图片%s,图片地址为%s。" %(name,pic_url))
            with open(os.path.join(page_dir, name), "wb") as file:
                file.write(response.content)
    finally:
        # Always release the Chrome process, even when a page fails.
        browser.quit()
if __name__ == '__main__':
    # Crawl index pages 1 through 254 inclusive. range() excludes its stop
    # value, so the stop must be 255 to actually reach page 254.
    for p in range(1, 255):
        get_pic(p)
|