X Tutup
#coding=utf-8 import urllib2 import re import os import socket def pachong(url , thref , ttitle): "我的第一个爬虫程序 url 是网页路径 , href 是返回的连接 , title 是标题" try: htmlfile = open("firstfile","a") except IOError: print "Error: 没有找到文件,或者读取文件失败" return '1' else: #print "建立文件成功" try: # url = input("please input a url\n") response = urllib2.urlopen(url, timeout = 10) #获取网页数据 except urllib2.URLError, e: if isinstance(e.reason, socket.timeout): print("Restart"); return '2' else: #如果有错打印错误 return '1' else: try: tem = response.read() except IOError: print "Error: 读取错误或者文件错误" return '1' else: print("firstfile") patter = re.compile(r'') nextPatter= re.compile(r'') nextlink = re.compile(r'.+?href="(?P.+?)".+') #patter = re.compile(r'.+?)".+title="(?P.+?)".+</a>') pattern= re.compile(r'.+?href="(?P<href>.+?)".+title="(?P<title>.+?)".+') Nextpage = nextPatter.findall(tem) #print Nextpage Nextlink = nextlink.search(Nextpage[0]) os.environ['var']=str(Nextlink.group(1)) os.system('echo $var > NextLink') print Nextlink.group(1) match = patter.findall(tem) for var in match: # print(var.decode('utf-8')) tmatch = pattern.search(var) thref.append(tmatch.group(1).decode('utf-8')) ttitle.append(tmatch.group(2).decode('utf-8')) htmlfile.write(tmatch.group(1)+'\n') htmlfile.write(tmatch.group(2)+'\n') htmlfile.close() return Nextlink.group(1) refile = open('NextLink',"r+") url = refile.read() title = [] href = [] while True: nexturl = pachong(url , href , title) if (nexturl == '1'): print("ERROR") elif(nexturl == '2'): url = refile.read() elif(nexturl == ''): break; else: url = nexturl
X Tutup