File tree Expand file tree Collapse file tree 2 files changed +46
-0
lines changed
Expand file tree Collapse file tree 2 files changed +46
-0
lines changed Original file line number Diff line number Diff line change 1+ import requests
2+ import re
3+ import urlparse
4+
# Pre-compiled patterns, hoisted to module level so they compile once.
# Email: local part allows word chars, dots, plus and hyphen; the domain is
# word chars/hyphens with at least one dotted label.  (The original classes
# `[\w\.,]` wrongly accepted commas inside addresses.)
email_re = re.compile(r'([\w.+-]+@[\w-]+(?:\.[\w-]+)+)')
# Link: naive href extraction from raw HTML (double-quoted attributes only).
link_re = re.compile(r'href="(.*?)"')
def crawl(url, maxlevel):
    """Recursively scrape e-mail addresses starting from *url*.

    Follows every ``href`` found on each fetched page, descending at most
    *maxlevel* link-hops from the start page.

    Parameters:
        url      -- absolute URL of the page to start from.
        maxlevel -- maximum recursion depth (<= 0 fetches nothing).

    Returns:
        A ``set`` of the e-mail address strings found.  On a fetch failure
        or a non-200 response the (possibly empty) set gathered so far is
        returned — the original returned a list ``[]`` here, inconsistent
        with the ``set`` returned on success.
    """
    result = set()

    # Depth exhausted: stop recursing.
    if maxlevel <= 0:
        return result

    # Fetch the page; treat network errors and non-200 replies as "no data"
    # rather than crashing the whole crawl.
    try:
        req = requests.get(url)
    except requests.RequestException:
        return result
    if req.status_code != 200:
        return result

    # Harvest every e-mail on the current page first.
    result.update(email_re.findall(req.text))

    print("Crawled level: {}".format(maxlevel))

    # Follow each link one level deeper and merge what the sub-crawl finds.
    # The original recursed only once, after the loop — so it followed just
    # the *last* link (NameError if the page had none) — and it discarded
    # the recursive call's return value, losing all sub-page e-mails.
    for link in link_re.findall(req.text):
        # Resolve relative hrefs against the current page's URL.
        absolute = urlparse.urljoin(url, link)
        result.update(crawl(absolute, maxlevel - 1))

    return result
if __name__ == "__main__":
    # Crawl two link-levels deep starting from the seed URL.
    emails = crawl('http://www.website_goes_here_dot_com', 2)

    # "Scraped" — the original string misspelled it as "Scrapped".
    print("\nScraped e-mail addresses:")
    for email in emails:
        print(email)
1. **05_load_json_without_dupes.py**: load JSON, convert to dict, raise an error if there is a duplicate key
1. **06_execution_time.py**: class used for timing execution of code
1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
1. **08_basic_email_web_crawler.py**: web crawler for grabbing e-mails from a website recursively
You can’t perform that action at this time.
0 commit comments