Python爬虫爬取ok资源网电影播放地址

# Crawl movie playback URLs from okzy.co (ok资源网)  www.okzy.co
# Entry 1: http://okzy.co/index.php?m=vod-search&wd={keyword}&submit=search
# Entry 2: http://www.okzy.co/?m=vod-type-id-{1-34}.html
#          http://www.okzy.co/?m=vod-index-pg-{1-1110}.html
# Flow: index page -> list of movies -> detail page -> play URLs (+ episode
# names) -> print as tables (one table per movie title).

import requests
from lxml import etree
# Table-rendering module: pip install prettytable
from prettytable import PrettyTable

host = "http://www.okzy.co"
# Request the first index (listing) page.
rooturl = "/?m=vod-index-pg-{}.html".format(1)

# timeout added so a dead server cannot hang the script forever
response = requests.get(host + rooturl, timeout=10)
# Force UTF-8 so Chinese titles decode correctly.
response.encoding = 'utf-8'

if response.status_code == 200:
    # Index page fetched OK — start scraping.
    print("==========爬虫工作开始==========")
    page_index_xp = etree.HTML(response.text)
    # XPath rules for the listing: movie title and its detail-page link.
    page_index_xp_title = page_index_xp.xpath(
        "//div[@class='xing_vb']/ul/li/span[@class='xing_vb4']/a/text()")
    page_index_xp_titleurl = page_index_xp.xpath(
        "//div[@class='xing_vb']/ul/li/span[@class='xing_vb4']/a/@href")

    # Walk titles and their links in lockstep (replaces a manual counter).
    for p_i_title, p_i_href in zip(page_index_xp_title, page_index_xp_titleurl):
        p_i_titleurl = host + p_i_href
        # Outer table: the movie title; inner table: episode name + play URL.
        table_index = PrettyTable(['《{}》'.format(p_i_title.strip())])
        table_info = PrettyTable(['资源名称', '播放地址'])

        # Second-level request: detail page with the play URLs.
        page_sec_info = requests.get(p_i_titleurl, timeout=10)
        if page_sec_info.status_code == 200:
            page_sec_xp = etree.HTML(page_sec_info.text)
            # Each <li> holds "episode_name$play_url".
            page_sec_xp_playurl = page_sec_xp.xpath(
                "//div[@class='vodplayinfo']//ul/li/text()")
            for piurl in page_sec_xp_playurl:
                # partition() never raises, unlike split("$")[1] which would
                # IndexError on a malformed entry without a "$" separator.
                piname, sep, pilink = piurl.partition("$")
                if not sep:
                    continue  # skip entries that carry no play URL
                table_info.add_row([piname, pilink])
                # Snippet for dynamically generating a playable HTML page.
                htmltemp = ('<video autoplay="autoplay" controls="controls" '
                            'src="{}"></video>').format(pilink)
            table_index.add_row([table_info])
            print(table_index)
else:
    print("==========爬虫不能正确工作,原因:{}==========".format(response.status_code))

Published by

风君子

独自遨游何稽首 揭天掀地慰生平