forked from csev/py4e
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathurlforever.py
More file actions
33 lines (28 loc) · 845 Bytes
/
urlforever.py
File metadata and controls
33 lines (28 loc) · 845 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import urllib
from BeautifulSoup import *
# Simple link-following crawler (Python 2: relies on raw_input and
# urllib.urlopen).  Starting from a URL typed by the user, it fetches each
# page, prints every absolute "http:" link found in its anchor tags, and adds
# links it has not met before to the work list.  The loop ends only when the
# work list is empty.
#
# Module-level names:
#   urls   - work list of URLs still to fetch; used as a stack (pop() takes
#            the most recently added URL first)
#   done   - URLs already taken off the work list, in the order they were taken
#   queued - every URL that has ever been put on the work list; used so the
#            same URL is never queued (and therefore fetched) twice
#
# print is always called with one parenthesised, pre-formatted string so each
# line of output is the same text the comma-separated print statements
# produced (the doubled spaces in the format strings are intentional).
print("Warning - this program wil likely run forever")

url = raw_input('Enter starting point - ')
urls = [url]
done = []
queued = set(urls)

while urls:
    print("================ We have  %d  Urls =======" % len(urls))
    url = urls.pop()
    print("========= Retrieveing URL =  %s" % url)
    done.append(url)

    try:
        handle = urllib.urlopen(url)
        try:
            html = handle.read()
        finally:
            # Release the connection even if read() fails part-way through.
            handle.close()
    except Exception:
        # Best effort: report any retrieval error and move on to the next URL.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate, so Ctrl-C stops the crawl.
        print("Failed to retrieve %s" % url)
        continue

    print("Length %d" % len(html))
    soup = BeautifulSoup(html)

    # Retrieve all of the anchor tags
    tags = soup('a')
    print("Link count: %d" % len(tags))

    for tag in tags:
        link = tag.get('href', None)
        # Skip anchors without an href and anything that is not an absolute
        # "http:" link (relative links, https:, mailto:, ...).
        if link is None or not link.startswith('http:'):
            continue
        print(link)
        # Queue each distinct link once.  Checking only `done` would let a
        # link that is already waiting in `urls` be queued again.
        if link not in queued:
            queued.add(link)
            urls.append(link)