#coding=utf-8
import requests
import re, os, time, ConfigParser
from selenium import webdriver
from multiprocessing.dummy import Pool

###### single process ######
# create the directory that will hold the screenshots
def createImagesPath():
    dirname = os.path.dirname(os.path.abspath(__file__))
    imges_path = os.path.join(dirname, time.strftime("%Y%m%d"))
    try:
        if not os.path.exists(imges_path):
            os.mkdir(imges_path)
        print u"screenshot directory:", imges_path
        return imges_path
    except Exception, e:
        print e
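# For example (hypothetical date), a run on 2016-08-05 would create and reuse
# <script dir>\20160805; screenshots are later written there as 1.png, 2.png, ...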
# read the list of site URLs to crawl from a file
def getWebUrls(web_urls_file_path):
    web_urls = []
    try:
        with open(web_urls_file_path) as fp:
            lines = fp.readlines()
        for line in lines:
            if line.strip():
                web_urls.append(line.strip())
        return web_urls
    except Exception, e:
        print e
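# urls.txt holds one start URL per line; blank lines are skipped.
# Hypothetical contents:
#   http://www.example.com
#   http://www.example.org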
# collect all valid links from a single site
def getLinks(web_url):
    try:
        response = requests.get(web_url)
        html = response.text
        links = re.findall(r'href="(.*?)"', html)
        valid_links = []
        invalid_links = []
        for link in links:
            if link.strip().startswith("//"):
                # protocol-relative link: prepend a scheme
                valid_links.append("http:" + link.strip())
            elif link.strip() == "" or link.strip() == "#" or link.strip() == "/" \
                    or link.strip().count("javascript") >= 1 or link.strip().count("mailto:") >= 1:
                invalid_links.append(link)
            elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$", link.strip()) \
                    or re.match(r'/[^/].*', link.strip()):
                # static resources (by suffix) and single-slash relative paths are skipped
                invalid_links.append(link)
            else:
                valid_links.append(link.strip())
        valid_links = list(set(valid_links))  # de-duplicate
        return valid_links
    except Exception, e:
        print e
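# Illustrative outcomes of the rules above (hypothetical URLs):
#   //cdn.example.com/page.html   -> kept as http://cdn.example.com/page.html
#   javascript:void(0), mailto:a@b.com, "", "#", "/" -> dropped
#   http://example.com/style.css  -> dropped (static resource suffix)
#   /about.html                   -> dropped (single-slash relative path)
#   http://example.com/about      -> kept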
# save the valid links to a .txt file
def saveLinks(links):
    dirname = os.path.dirname(os.path.abspath(__file__))
    links_path = os.path.join(dirname, time.strftime("%Y%m%d"))
    try:
        if not os.path.exists(links_path):
            os.mkdir(links_path)
        links_file_path = os.path.join(links_path, "links.txt")
        print u"links file path:", links_file_path
        with open(links_file_path, "w") as fp:
            fp.writelines([link + "\n" for link in links])
    except Exception, e:
        print e
# drive a browser: open each link and save a screenshot
class OpenLinkAndSaveImg(object):
    def __init__(self, browser_type):
        try:
            configFilePath = os.path.dirname(os.path.abspath(__file__)) + "\\browserAndDriver.ini"
            print u"browser driver config file path:", configFilePath
            cf = ConfigParser.ConfigParser()
            cf.read(configFilePath)
            browser_type = browser_type.strip().lower()
            driver_path = cf.get("browser_driver", browser_type).strip()
            print u"browser: %s, driver location: %s" % (browser_type, driver_path)
            # the .ini stores each path as a quoted Python string literal, hence eval()
            if browser_type == "ie":
                self.driver = webdriver.Ie(executable_path=eval(driver_path))
            elif browser_type == "chrome":
                self.driver = webdriver.Chrome(executable_path=eval(driver_path))
            elif browser_type == "firefox":
                self.driver = webdriver.Firefox(executable_path=eval(driver_path))
            else:
                print "invalid browser!"
        except Exception, e:
            print e

    # open one link and save its screenshot
    def openLinkAndSaveImg(self, link_index_imgspath):
        try:
            link, index, imgspath = link_index_imgspath
            self.driver.get(link)
            self.driver.maximize_window()
            self.driver.get_screenshot_as_file(os.path.join(imgspath, str(index + 1) + ".png"))
        except Exception, e:
            print e

    def end(self):
        self.driver.quit()
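# A hypothetical browserAndDriver.ini (assumed layout; the code reads section
# "browser_driver" and eval()s the value, so each path must be written as a
# quoted Python string literal):
#   [browser_driver]
#   ie="C:\\IEDriverServer.exe"
#   chrome="C:\\chromedriver.exe"
#   firefox="C:\\geckodriver.exe"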
if __name__ == "__main__":
    # single process
    imgs_path = createImagesPath()
    weburls = getWebUrls(os.path.dirname(os.path.abspath(__file__)) + "\\urls.txt")
    links = []
    start_time = time.time()
    for weburl in weburls:
        links += getLinks(weburl)
    print u"number of links: %s; time to gather all links: %s" % (len(links), time.time() - start_time)
    saveLinks(links)
    start_time1 = time.time()
    open_link_and_save_img = OpenLinkAndSaveImg("ie")
    for i in range(len(links)):
        open_link_and_save_img.openLinkAndSaveImg((links[i], i, imgs_path))
    open_link_and_save_img.end()
    print u"single process, time to open all links and screenshot:", time.time() - start_time1
###### multithreaded (concurrent execution via thread pool) ######
# coding=utf-8
import requests
import re, os, time, ConfigParser
from selenium import webdriver
from multiprocessing.dummy import Pool

# create the directory that will hold the screenshots
def createImagesPath():
    dirname = os.path.dirname(os.path.abspath(__file__))
    imges_path = os.path.join(dirname, time.strftime("%Y%m%d"))
    try:
        if not os.path.exists(imges_path):
            os.mkdir(imges_path)
        print u"directory for all screenshots:", imges_path
        return imges_path
    except Exception, e:
        print e
# read the list of site URLs to crawl from a file
def getWebUrls(web_urls_file_path):
    web_urls = []
    try:
        with open(web_urls_file_path) as fp:
            lines = fp.readlines()
        for line in lines:
            if line.strip():
                web_urls.append(line.strip())
        return web_urls
    except Exception, e:
        print e
# collect all valid links from a single site
def getLinks(web_url):
    try:
        response = requests.get(web_url)
        html = response.text
        links = re.findall(r'href="(.*?)"', html)
        valid_links = []
        invalid_links = []
        for link in links:
            if link.strip().startswith("//"):
                # protocol-relative link: prepend a scheme
                valid_links.append("http:" + link.strip())
            elif link.strip() == "" or link.strip() == "#" or link.strip() == "/" \
                    or link.strip().count("javascript") >= 1 or link.strip().count("mailto:") >= 1:
                invalid_links.append(link)
            elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$", link.strip()) \
                    or re.match(r'/[^/].*', link.strip()):
                # static resources (by suffix) and single-slash relative paths are skipped
                invalid_links.append(link)
            else:
                valid_links.append(link.strip())
        valid_links = list(set(valid_links))  # de-duplicate
        return valid_links
    except Exception, e:
        print e
# save the valid links to a .txt file
def saveLinks(links):
    dirname = os.path.dirname(os.path.abspath(__file__))
    links_path = os.path.join(dirname, time.strftime("%Y%m%d"))
    try:
        if not os.path.exists(links_path):
            os.mkdir(links_path)
        links_file_path = os.path.join(links_path, "links.txt")
        print u"path where all links are saved:", links_file_path
        with open(links_file_path, "w") as fp:
            fp.writelines([link + "\n" for link in links])
    except Exception, e:
        print e
# get the browser type and its driver path from the config file
def getBrowserAndDriver(browser_type):
    try:
        configFilePath = os.path.dirname(os.path.abspath(__file__)) + "\\browserAndDriver.ini"
        print u"browser driver config file path:", configFilePath
        cf = ConfigParser.ConfigParser()
        cf.read(configFilePath)
        browser_type = browser_type.strip().lower()
        driver_path = cf.get("browser_driver", browser_type).strip()
        print u"browser: %s, driver location: %s" % (browser_type, driver_path)
        return browser_type, driver_path
    except Exception, e:
        print e
# open one link and save its screenshot (a fresh WebDriver per call)
def openLinkAndSaveImg(browser_driver_link_index_imgspath):
    try:
        browser, driverpath, link, index, imgspath = browser_driver_link_index_imgspath
        # build the constructor call as a string, then eval() it
        command = "webdriver." + browser.capitalize() + "(executable_path=" + driverpath + ")"
        driver = eval(command)
        driver.get(link)
        driver.maximize_window()
        driver.get_screenshot_as_file(os.path.join(imgspath, str(index + 1) + ".png"))
        driver.quit()
    except Exception, e:
        print e
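# For example (hypothetical values): browser="ie" and
# driverpath='"C:\\IEDriverServer.exe"' produce the string
#   webdriver.Ie(executable_path="C:\\IEDriverServer.exe")
# which eval() executes; this only works for browsers whose webdriver class
# name equals browser.capitalize() (Ie, Chrome, Firefox).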
if __name__ == "__main__":
    imgs_path = createImagesPath()
    weburls = getWebUrls(os.path.dirname(os.path.abspath(__file__)) + "\\urls.txt")
    p = Pool(5)
    start_time1 = time.time()
    links_list = p.map(getLinks, weburls)
    end_time1 = time.time()
    links = []
    for link_list in links_list:
        links += link_list
    saveLinks(links)
    print u"number of links: %s; time to gather all links: %s" % (len(links), end_time1 - start_time1)
    browser, driver = getBrowserAndDriver("ie")
    browser_driver_link_index_imgspath = zip([browser] * len(links), [driver] * len(links),
                                             links, range(len(links)), [imgs_path] * len(links))
    start_time2 = time.time()
    p.map(openLinkAndSaveImg, browser_driver_link_index_imgspath)
    p.close()
    p.join()
    print u"thread pool, time to open all links and screenshot:", time.time() - start_time2
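# Note: multiprocessing.dummy.Pool is the Pool API backed by threads rather
# than processes, so p.map fans getLinks and openLinkAndSaveImg out across
# five worker threads; each openLinkAndSaveImg call starts and quits its own
# WebDriver, so no driver instance is ever shared between threads.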
###### multithreaded (Queue + threading) ######
# coding=utf-8
import requests
import re, os, time, ConfigParser
from selenium import webdriver
from multiprocessing.dummy import Pool
import Queue
import threading
# create the directory that will hold the screenshots
def createImagesPath():
    dirname = os.path.dirname(os.path.abspath(__file__))
    imges_path = os.path.join(dirname, time.strftime("%Y%m%d"))
    try:
        if not os.path.exists(imges_path):
            os.mkdir(imges_path)
        print u"directory for all screenshots:", imges_path
        return imges_path
    except Exception, e:
        print e
# read the list of site URLs to crawl from a file
def getWebUrls(web_urls_file_path):
    web_urls = []
    try:
        with open(web_urls_file_path) as fp:
            lines = fp.readlines()
        for line in lines:
            if line.strip():
                web_urls.append(line.strip())
        return web_urls
    except Exception, e:
        print e
# collect all valid links from a single site
def getLinks(web_url):
    try:
        response = requests.get(web_url)
        html = response.text
        links = re.findall(r'href="(.*?)"', html)
        valid_links = []
        invalid_links = []
        for link in links:
            if link.strip().startswith("//"):
                # protocol-relative link: prepend a scheme
                valid_links.append("http:" + link.strip())
            elif link.strip() == "" or link.strip() == "#" or link.strip() == "/" \
                    or link.strip().count("javascript") >= 1 or link.strip().count("mailto:") >= 1:
                invalid_links.append(link)
            elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$", link.strip()) \
                    or re.match(r'/[^/].*', link.strip()):
                # static resources (by suffix) and single-slash relative paths are skipped
                invalid_links.append(link)
            else:
                valid_links.append(link.strip())
        valid_links = list(set(valid_links))  # de-duplicate
        return valid_links
    except Exception, e:
        print e
# save the valid links to a .txt file (<script dir>\YYYYMMDD\links.txt)
def saveLinks(links):
    dirname = os.path.dirname(os.path.abspath(__file__))
    links_path = os.path.join(dirname, time.strftime("%Y%m%d"))
    try:
        if not os.path.exists(links_path):
            os.mkdir(links_path)
        links_file_path = os.path.join(links_path, "links.txt")
        print u"path where all links are saved:", links_file_path
        with open(links_file_path, "w") as fp:
            fp.writelines([link + "\n" for link in links])
    except Exception, e:
        print e
# multithreaded: each worker thread owns one WebDriver and consumes the queue
class MyThread(threading.Thread):
    def __init__(self, browser, queue):
        threading.Thread.__init__(self)
        self.queue = queue
        try:
            configFilePath = os.path.dirname(os.path.abspath(__file__)) + "\\browserAndDriver.ini"
            cf = ConfigParser.ConfigParser()
            cf.read(configFilePath)
            browser_type = browser.strip().lower()
            driver_path = cf.get("browser_driver", browser_type).strip()
            if browser_type == "ie":
                self.driver = webdriver.Ie(executable_path=eval(driver_path))
            elif browser_type == "chrome":
                self.driver = webdriver.Chrome(executable_path=eval(driver_path))
            elif browser_type == "firefox":
                self.driver = webdriver.Firefox(executable_path=eval(driver_path))
            else:
                print "invalid browser!"
        except Exception, e:
            print e

    def run(self):
        print "Starting " + self.name
        openLinkAndSaveImg(self.driver, self.queue)
        self.driver.quit()

# open links from the queue and save screenshots
def openLinkAndSaveImg(driver, queue):
    while True:
        # hold the lock across the empty() check and the get(); otherwise the
        # queue could be drained by another thread in between, leaving this
        # thread blocked forever on get()
        queueLock.acquire()
        if queue.empty():
            queueLock.release()
            break
        link_index_imgspath = queue.get()
        queueLock.release()
        try:
            link, index, imgspath = link_index_imgspath
            driver.get(link)
            driver.maximize_window()
            driver.get_screenshot_as_file(os.path.join(imgspath, str(index + 1) + ".png"))
        except Exception, e:
            print e
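# Design note: each MyThread gets its own WebDriver because a WebDriver
# session is generally not safe to share across threads; the queue (guarded
# by queueLock) is the only shared state. Queue.Queue is itself thread-safe,
# so the lock is only needed to make the empty()/get() pair atomic.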
if __name__ == "__main__":
    # multithreaded
    imgs_path = createImagesPath()
    weburls = getWebUrls(os.path.dirname(os.path.abspath(__file__)) + "\\urls.txt")
    p = Pool(5)
    start_time1 = time.time()
    links_list = p.map(getLinks, weburls)
    end_time1 = time.time()
    links = []
    for link_list in links_list:
        links += link_list
    saveLinks(links)
    print u"number of links: %s; time to gather all links: %s" % (len(links), end_time1 - start_time1)
    link_index_imgspath = zip(links, range(len(links)), [imgs_path] * len(links))
    queueLock = threading.Lock()
    threads = []
    link_index_imgspath_Queue = Queue.Queue(len(links))
    for element in link_index_imgspath:
        link_index_imgspath_Queue.put(element)
    start_time2 = time.time()
    # five worker threads share one queue
    for i in range(5):
        thread = MyThread("ie", link_index_imgspath_Queue)
        thread.start()
        threads.append(thread)
    for t in threads:
        t.join()
    print u"multithreaded, time to open all links and screenshot:", time.time() - start_time2
    print "end!"