A Download Tool for Communications of the CCF (a Simple Crawler)

CCCF

Communications of the CCF (CCCF) is a monthly magazine sponsored by the China Computer Federation and published by Higher Education Press, aimed at computing professionals and people working in related information fields. Drawing on the Federation's academic strengths, the magazine invites the most influential experts in each area of information technology to contribute, offering a comprehensive, big-picture view of the latest developments in computer science and technology and forecasting future trends. It helps readers broaden their horizons, follow the IT frontier, and grasp where the field is heading; it is authoritative and instructive, and well suited to readers in computing-related research, teaching, industry, and management.

Website: http://www.ccf.org.cn/sites/ccf/zgjsjxhtx.jsp

The download problem

First, a look at the download process. From the CCCF page, to download an article from a given issue you have to click the issue, then an article title, then the download link, and you end up with a file named something like "0.pdf" or "1.pdf".

If an issue contains, say, 15 articles, downloading all of them takes dozens of clicks, and since the downloaded files are all named with bare numbers, renaming them one by one eats up even more time.

Solution (two programs):

(1) Given the id of an issue, automatically download all of its articles;

GitHub: https://github.com/cheesezhe/ccf_crawler (documentation included)

(2) Automatically download all articles of all issues; just run the source code below;

Download: http://pan.baidu.com/s/1gdWOTt5 (password: 6212)
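Both programs boil down to the same core flow: parse the paper ids and titles out of an issue page, resolve each paper's download URL, and save the PDF. Here is a minimal sketch of that flow, using the functions defined in the listing below (the contentId value 12345 is a made-up placeholder):

journal_url = 'http://www.ccf.org.cn/sites/ccf/jsjtbbd.jsp?contentId=12345'
ids, titles, name = parse_data_from_journal_url(journal_url)
for paper_id, title in zip(ids, titles):
    url = get_url_by_paper_id(paper_id)
    if url != -1:
        download_by_paper_url(url, get_paper_title(title) + '.pdf')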

Source code:

#!/usr/bin/env python
#-*-coding:utf-8-*-
__author__ = 'ZhangHe'
import urllib2, re, os, httplib, urllib


def download_by_paper_url(src_url, dest_file):
    """
    Download the file at the paper link src_url and save it as dest_file.
    :param src_url:
    :param dest_file:
    :return:
    """
    f = urllib2.urlopen(src_url)
    try:
        data = f.read()
    except httplib.IncompleteRead as e:
        with open('err_log.txt', 'a+') as err:  # error log
            err.write("%s %s\n" % (src_url, e))  # log the URL and the exception
        print 'Error'
        return -1
    with open(dest_file, "wb") as code:
        code.write(data)
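# Note on the return contract: this function returns None on success and -1 on
# failure; the main loop below relies on that. Example call (hypothetical URL):
#   download_by_paper_url('http://www.ccf.org.cn/sites/ccf/download.jsp?file=/a/b.pdf', 'b.pdf')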


def parse_data_from_journal_url(src_url):
    """
    Given a journal issue URL, extract the paper ids, the paper titles,
    and the issue name.
    :param src_url:
    :return: [paper_ids, paper_titles, journal_name]
    """
    request = urllib2.Request(src_url)
    response = urllib2.urlopen(request)
    content = response.read().decode('utf-8')

    print 'parsing paper IDs...'
    pattern_str1 = '<a target=.*?title=.*?href=.*?contentId=(.*?)">'
    pattern_str2 = '<span id=.*?class="cfqwz">(.*?)</span>'
    pattern_str3 = '<title>(.*?)-.*?</title>'
    pattern1 = re.compile(pattern_str1, re.S)
    pattern2 = re.compile(pattern_str2, re.S)
    pattern3 = re.compile(pattern_str3, re.S)
    ids = re.findall(pattern1, content)
    titles = re.findall(pattern2, content)
    name = re.findall(pattern3, content)

    return [ids, titles, name[0].strip()]


def get_url_by_paper_id(paper_id):
    """
    Given a paper id, build the paper's download URL.
    :param paper_id:
    :return:
    """
    src_url = 'http://www.ccf.org.cn/sites/ccf/freexiazai.jsp?contentId=' + str(paper_id)
    request = urllib2.Request(src_url)
    response = urllib2.urlopen(request)
    content = response.read().decode('utf-8')

    pattern_str = 'class=""><a href="(.*?)">.*?</a></span>'
    pattern = re.compile(pattern_str, re.S)
    urls = re.findall(pattern, content)
    #
    # If there is no url, return -1
    if len(urls) < 1:
        return -1
    #
    # Percent-encode the Chinese characters in the file-name segment of the url
    tmps = urls[0].split('/')
    tmps[-1] = urllib.quote(tmps[-1].encode('utf-8'))
    tmp = ''
    for i in tmps:
        tmp += '/' + i
    return 'http://www.ccf.org.cn/sites/ccf/download.jsp?file=' + tmp
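# Example (hypothetical id and file name, for illustration only):
#   get_url_by_paper_id(12345)
# might return something like
#   'http://www.ccf.org.cn/sites/ccf/download.jsp?file=/attach/0/%E6%96%87%E7%AB%A0.pdf'
# i.e. only the final path segment (the file name) gets percent-encoded.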


def get_all_journals_ids():
    """
    Get the ids of all journal issues.
    """
    urls = [
        'http://www.ccf.org.cn/sites/ccf/zgjsjxhtx.jsp?jportal=SFXxdDjYKXLl06cz1fxjkzihsqP9JcoP',      # issues 89-118
        'http://www.ccf.org.cn/sites/ccf/zgjsjxhtx.jsp?jportal=SFXxdDjYKXLl06cz1fxjk%2FySA9FzIG2g',    # issues 59-88
        'http://www.ccf.org.cn/sites/ccf/zgjsjxhtx.jsp?jportal=SFXxdDjYKXLl06cz1fxjk7R3hW0kV5Np',      # issues 29-58
        'http://www.ccf.org.cn/sites/ccf/zgjsjxhtx.jsp?jportal=SFXxdDjYKXLl06cz1fxjk%2BP28%2Bg%2BBW1u' # issues 01-28
    ]
    res = []

    for src_url in urls:
        print 'processing\t' + src_url
        request = urllib2.Request(src_url)
        response = urllib2.urlopen(request)
        content = response.read().decode('utf-8')

        pattern_str = '<li id="(.*?)">.*?<a target='
        pattern = re.compile(pattern_str, re.S)
        ids = re.findall(pattern, content)
        res.extend(ids)
    return res


def get_all_done_papers_ids():
    """
    Get the ids of all already-downloaded papers.
    :return:
    """
    if not os.path.exists('dl_list.txt'):  # no record yet (first run)
        return []
    dl_ids = []
    with open('dl_list.txt', 'r') as dl:  # ids of downloaded papers
        for i in dl:
            dl_ids.append(i.strip())
    return dl_ids


def get_all_done_journals_ids():
    """
    Get the ids of all fully downloaded journal issues.
    :return:
    """
    if not os.path.exists('dl_list_j.txt'):  # no record yet (first run)
        return []
    dl_j = []
    with open('dl_list_j.txt', 'r') as dl:  # ids of downloaded issues
        for i in dl:
            dl_j.append(i.strip())
    return dl_j


def create_new_directory(dir_name):
    """
    Create a directory named dir_name.
    :param dir_name:
    :return:
    """
    try:
        os.mkdir(dir_name)
    except OSError:  # directory already exists (OSError is portable; WindowsError exists only on Windows)
        pass


def get_paper_title(origin_title):
    """
    Normalize a paper title so it is safe to use as a file name.
    :param origin_title:
    :return ret:
    """
    ret = origin_title.strip()
    ret = ret.replace('/', '-')
    ret = ret.replace('?', '')
    ret = ret.replace('*', '_x_')
    return ret
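# Examples (hypothetical titles, for illustration only):
#   get_paper_title(u' TCP/IP internals? ') -> u'TCP-IP internals'
#   get_paper_title(u'A*-search')           -> u'A_x_-search'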

if __name__ == '__main__':
    """
    Step 1: get the list of all issue ids, the list of already-downloaded
    issue ids, and the list of already-downloaded paper ids.
    """
    all_journals_ids = get_all_journals_ids()
    all_done_journals_ids = get_all_done_journals_ids()
    all_done_papers_ids = get_all_done_papers_ids()

    """
    Step 2: walk through the issue ids one by one.
    """
    for journal_id in all_journals_ids:
        #
        # Skip the issue if it has already been downloaded
        if journal_id in all_done_journals_ids:
            print '%s has been downloaded.' % journal_id
            continue
        #
        # Parse the issue page: ret_data = [paper ids, paper titles, issue name]
        journal_url = 'http://www.ccf.org.cn/sites/ccf/jsjtbbd.jsp?contentId=' + journal_id
        ret_data = parse_data_from_journal_url(journal_url)
        print 'Start Download %s\t %s' % (journal_id, ret_data[2])
        #
        # Create a directory named after the issue
        create_new_directory(ret_data[2])
        finished = 0
        """
        Step 3: walk through the paper ids in ret_data one by one.
        """
        for idx in xrange(len(ret_data[0])):
            paper_id = ret_data[0][idx]
            #
            # Skip the paper if it has already been downloaded
            if paper_id in all_done_papers_ids:
                print 'Paper %s has been downloaded.' % paper_id
                finished += 1
                continue
            #
            # Resolve the download url for paper_id
            title = get_paper_title(ret_data[1][idx])
            print 'Downloading (%s/%s) ID:%s Title:%s' % (str(idx+1), str(len(ret_data[0])), paper_id, title)
            target_url = get_url_by_paper_id(paper_id)
            #
            # if target_url is -1, it means there is no url
            # for paper_id (this is a very special situation)
            if target_url == -1:
                print 'There is no url for paper %s' % paper_id
                finished += 1
                continue
            """
            Step 4: download the file from the resolved url.
            """
            dl_result = download_by_paper_url(target_url, os.path.join(ret_data[2], title + '.pdf'))
            if dl_result != -1:
                finished += 1
                with open('dl_list.txt', 'a+') as dl:  # record the id of the downloaded paper
                    dl.write(paper_id + '\n')
            else:
                with open('err_list.txt', 'a+') as err:  # record the issue id and paper id of the failed download
                    err.write(journal_id + ' ' + paper_id + '\n')
        if finished == len(ret_data[0]):
            with open('dl_list_j.txt', 'a+') as dl:  # record the id of the fully downloaded issue
                dl.write(journal_id + '\n')
    print 'All finished.'
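The script targets Python 2 (urllib2, httplib, print statements, xrange). For anyone adapting it to Python 3, the modules map roughly as follows; this is a sketch of the equivalents, not part of the original tool:

# Rough Python 3 equivalents of the Python 2 modules used above (a sketch):
from urllib.request import Request, urlopen  # replaces urllib2.Request / urllib2.urlopen
from urllib.parse import quote               # replaces urllib.quote
from http.client import IncompleteRead      # replaces httplib.IncompleteRead

# In addition, print statements become print() calls and xrange becomes range;
# response.read() still returns bytes, so .decode('utf-8') stays as-is.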

 
