本篇文章小编给大家分享三个 Python 爬虫项目的代码实例,小编觉得挺不错的,现在分享给大家,供大家参考,有需要的小伙伴们可以来看看。
爬取内涵段子:
#encoding=utf-8
import urllib2
import re
class neihanba():
    """Interactive crawler for the joke listing pages of neihanpa.com.

    Every listing page is downloaded, the titles and joke bodies are pulled
    out with regular expressions, and each piece of text is appended as one
    line to ``di<page>yehtml.txt`` in the current working directory.

    NOTE(review): the HTML tags inside the patterns and ``*_TAGS`` tuples
    below were missing from the listing this class was recovered from (only
    the capture groups survived).  They are a reconstruction and must be
    confirmed against the real page markup before relying on the output.
    """

    LIST_URL = "http://www.neihanpa.com/article/list_5_%d.html"
    HEADERS = {
        "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }

    # Compiled once here instead of on every loop iteration.
    ITEM_RE = re.compile(r'<li class="piclist\d+">(.*?)</li>', re.S)
    TITLE_RE = re.compile(r'<a href="/article/\d+\.html">(.*)</a>')
    BODY_RE = re.compile(r'<div class="f18 mb20">(.*?)</div>', re.S)
    TITLE_TAGS = ("<b>", "</b>")
    BODY_TAGS = ("<p>", "</p>", "<br>", "<br />")

    def spider(self):
        """Main scheduler: crawl page 1, then continue while the user types "y"."""
        try:
            ask = raw_input  # Python 2
        except NameError:
            ask = input  # Python 3
        page = 1
        while True:
            html = self.load(self.LIST_URL % page)
            self.deal(html, page)
            if ask("是否继续(y/n)!") != "y":
                break
            page += 1

    def load(self, url):
        """Download *url* and return the response body.

        :param url: address of the page to fetch
        :return: the page content; on Python 3 the bytes are decoded to text
        """
        try:
            import urllib2 as urlrequest  # Python 2
        except ImportError:
            import urllib.request as urlrequest  # Python 3
        request = urlrequest.Request(url, headers=self.HEADERS)
        response = urlrequest.urlopen(request)
        try:
            body = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        if not isinstance(body, str):
            # Python 3 returns bytes, which the text patterns cannot search.
            # NOTE(review): UTF-8 is assumed; confirm the site's charset.
            body = body.decode("utf-8", "replace")
        return body

    def deal(self, html, page):
        """Extract titles and joke bodies from *html* and store them.

        :param html: content of a listing page, as returned by ``load``
        :param page: number of the page being processed
        """
        for text in self.extract(html):
            self.writeData(text, page)

    def extract(self, html):
        """Return the cleaned titles and bodies found in *html*.

        For every list item the titles come first, followed by the bodies,
        which is the order in which ``deal`` writes them.

        :param html: content of a listing page
        :return: list of strings with the markup removed
        """
        texts = []
        for item in self.ITEM_RE.findall(html):
            for title in self.TITLE_RE.findall(item):
                texts.append(self._remove_all(title, self.TITLE_TAGS))
            for body in self.BODY_RE.findall(item):
                body = self._remove_all(body, self.BODY_TAGS)
                # The entities end with ";": replacing only "&ldquo" and
                # "&rdquo" would leave a stray ";" in the stored text.
                body = body.replace("&ldquo;", '"').replace("&rdquo;", '"')
                texts.append(body)
        return texts

    @staticmethod
    def _remove_all(text, fragments):
        """Return *text* with every string in *fragments* removed."""
        for fragment in fragments:
            text = text.replace(fragment, "")
        return text

    def writeData(self, context, page):
        """Append *context* as one line to the output file of *page*.

        :param context: the extracted text
        :param page: number of the page the text came from
        """
        fileName = "di" + str(page) + "yehtml.txt"
        with open(fileName, "a") as out:
            out.write(context + "\n")
if __name__ == '__main__':
    # Script entry point: build the crawler and hand control to its scheduler.
    neihanba().spider()
爬取智联招聘:
#encoding=utf-8
import urllib
import urllib2
import re
class zhiLian():
    """Interactive crawler for zhaopin.com job search results.

    A search result page is downloaded, the link of every listed job is
    followed, and the company, monthly salary and required experience found
    on each job page are appended as one tab-separated line to
    ``di<page>yehtml.txt`` in the current working directory.

    NOTE(review): the HTML tags inside the patterns below were missing from
    the listing this class was recovered from (only the capture groups and
    the Chinese labels survived).  They are a reconstruction and must be
    confirmed against the real page markup before relying on the output.
    """

    SEARCH_URL = "http://sou.zhaopin.com/jobs/searchresult.ashx?"
    HEADERS = {
        "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }

    # Compiled once here instead of on every call.
    LINK_RE = re.compile(
        r'<a\s+style="font-weight:\s*bold"[^>]*?\shref="(.*?)"[^>]*>.*?</a>',
        re.S)
    COMPANY_RE = re.compile(r'<h2>\s*<a[^>]*>(.*?)</a>\s*</h2>', re.S)
    # The label may end with either a full-width or an ASCII colon.
    SALARY_RE = re.compile(
        r'<li><span>职位月薪(?::|:)</span><strong>(.*?)(?:&nbsp;.*?)?</strong></li>',
        re.S)
    EXPERIENCE_RE = re.compile(
        r'<li><span>工作经验(?::|:)</span><strong>(.*?)</strong></li>',
        re.S)
    TAG_RE = re.compile(r'<[^>]+>')

    def spider(self, position, workPlace):
        """Main scheduler: crawl page 1, then continue while the user types "y".

        :param position: job title to search for
        :param workPlace: city to search in
        """
        try:
            ask = raw_input  # Python 2
        except NameError:
            ask = input  # Python 3
        page = 1
        while True:
            # Build the address from scratch for every page; appending to one
            # shared string would turn it into "...&1&2&3".
            html = self.load(self.page_url(position, workPlace, page))
            self.deal1(html, page)
            if ask("是否继续爬虫下一页(y/n)!") != "y":
                break
            page += 1

    def page_url(self, position, workPlace, page):
        """Return the search URL for result page *page*.

        :param position: job title to search for
        :param workPlace: city to search in
        :param page: 1-based result page number
        :return: the complete, URL-encoded address
        """
        try:
            from urllib import urlencode  # Python 2
        except ImportError:
            from urllib.parse import urlencode  # Python 3
        # NOTE(review): "p" is assumed to be the site's page parameter.
        return (self.SEARCH_URL
                + urlencode({"jl": workPlace})
                + "&" + urlencode({"kw": position})
                + "&" + urlencode({"p": page}))

    def load(self, url):
        """Download *url* and return the response body.

        :param url: address of the page to fetch
        :return: the page content; on Python 3 the bytes are decoded to text
        """
        try:
            import urllib2 as urlrequest  # Python 2
        except ImportError:
            import urllib.request as urlrequest  # Python 3
        request = urlrequest.Request(url, headers=self.HEADERS)
        response = urlrequest.urlopen(request)
        try:
            body = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        if not isinstance(body, str):
            # Python 3 returns bytes, which the text patterns cannot search.
            # NOTE(review): UTF-8 is assumed; confirm the site's charset.
            body = body.decode("utf-8", "replace")
        return body

    def deal1(self, html, page):
        """Follow every job link found on a search result page.

        :param html: content of the result page, as returned by ``load``
        :param page: number of the result page being processed
        """
        for link in self.extract_links(html):
            self.deal2(link, page)

    def extract_links(self, html):
        """Return the job page addresses found in *html*, in page order."""
        return self.LINK_RE.findall(html)

    def deal2(self, t, page):
        """Fetch one job page and store its company, salary and experience.

        :param t: address of the job page
        :param page: number of the result page the link came from
        """
        self.writeData(self.parse_job(self.load(t)), page)

    def parse_job(self, html):
        """Return the tab-separated record extracted from a job page.

        Every company and every salary match is followed by a tab; the
        experience matches are appended last without a separator.

        :param html: content of the job page
        :return: the record, or an empty string when nothing matches
        """
        companies = [self.TAG_RE.sub("", name).strip()
                     for name in self.COMPANY_RE.findall(html)]
        salaries = self.SALARY_RE.findall(html)
        experiences = self.EXPERIENCE_RE.findall(html)
        record = "".join(field + "\t" for field in companies + salaries)
        return record + "".join(experiences)

    def writeData(self, context, page):
        """Append *context* as one line to the output file of *page*.

        :param context: the extracted record
        :param page: number of the result page the record belongs to
        """
        fileName = "di" + str(page) + "yehtml.txt"
        with open(fileName, "a") as out:
            out.write(context + "\n")
if __name__ == '__main__':
    # Script entry point: ask for the search terms, then start the crawl.
    try:
        prompt = raw_input  # Python 2
    except NameError:
        prompt = input  # Python 3 has no raw_input
    position = prompt("请输入职位:")
    workPlace = prompt("请输入工作地点:")
    zhiLian().spider(position, workPlace)
爬取贴吧:
#encoding=utf-8
import urllib
import urllib2
import re
class teiba():
    """Crawler for the thread titles of a Baidu Tieba forum.

    For every requested page the thread list is downloaded, the thread
    titles are extracted, and each title is appended as one line to
    ``di<page>yehtml.txt`` in the current working directory.

    NOTE(review): the HTML tags inside ``TITLE_RE`` were missing from the
    listing this class was recovered from (only the capture group survived).
    The pattern is a reconstruction and must be confirmed against the real
    page markup before relying on the output.
    """

    BASE_URL = "http://tieba.baidu.com/f?ie=utf-8&"
    PAGE_SIZE = 50  # the "pn" offset advances by this amount per page
    HEADERS = {
        "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }

    # Compiled once here instead of on every call.
    TITLE_RE = re.compile(r'<a[^>]*class="j_th_tit\s*"[^>]*>(.*?)</a>', re.S)
    TOPIC_RE = re.compile(r'#(.*?)#')

    def spider(self, name, startPage, endPage):
        """Crawl pages *startPage* to *endPage* (inclusive) of forum *name*.

        :param name: forum name
        :param startPage: first page to fetch, 1-based
        :param endPage: last page to fetch, inclusive
        """
        for page in range(startPage, endPage + 1):
            # Fetch the address that carries this page's offset; loading the
            # bare forum address would return the first page every time.
            html = self.loadPage(self.page_url(name, page))
            self.dealPage(html, page)

    def page_url(self, name, page):
        """Return the address of page *page* of forum *name*.

        :param name: forum name
        :param page: 1-based page number
        :return: the complete, URL-encoded address
        """
        try:
            from urllib import urlencode  # Python 2
        except ImportError:
            from urllib.parse import urlencode  # Python 3
        offset = self.PAGE_SIZE * (page - 1)
        return (self.BASE_URL + urlencode({"kw": name})
                + "&" + urlencode({"pn": offset}))

    def loadPage(self, url):
        """Download *url* and return the response body.

        :param url: address of the page to fetch
        :return: the page content; on Python 3 the bytes are decoded to text
        """
        try:
            import urllib2 as urlrequest  # Python 2
        except ImportError:
            import urllib.request as urlrequest  # Python 3
        request = urlrequest.Request(url, headers=self.HEADERS)
        response = urlrequest.urlopen(request)
        try:
            body = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        if not isinstance(body, str):
            # Python 3 returns bytes, which the text patterns cannot search.
            # The request asks for UTF-8 through the "ie=utf-8" parameter.
            body = body.decode("utf-8", "replace")
        return body

    def dealPage(self, html, page):
        """Extract the thread titles from *html* and store them.

        :param html: content of a thread list page, as returned by ``loadPage``
        :param page: number of the page being processed
        """
        for title in self.extract_titles(html):
            self.writePage(title, page)

    def extract_titles(self, html):
        """Return the thread titles in *html* with ``#...#`` spans removed."""
        return [self.TOPIC_RE.sub("", title)
                for title in self.TITLE_RE.findall(html)]

    def writePage(self, context, page):
        """Append *context* as one line to the output file of *page*.

        :param context: the extracted title
        :param page: number of the page the title came from
        """
        fileName = "di" + str(page) + "yehtml.txt"
        with open(fileName, "a") as out:
            out.write(context + "\n")
if __name__ == '__main__':
    # Script entry point: ask for the forum and page range, then crawl it.
    try:
        prompt = raw_input  # Python 2
    except NameError:
        prompt = input  # Python 3 has no raw_input
    name = prompt("请输入贴吧名:")
    startPage = prompt("请输入起始页:")
    endPage = prompt("请输入终止页:")
    # The prompts return text; spider() needs integer page numbers.
    teiba().spider(name, int(startPage), int(endPage))