forked from paulproteus/python-scraping-code-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml5lib_parse.py
More file actions
34 lines (29 loc) · 1002 Bytes
/
html5lib_parse.py
File metadata and controls
34 lines (29 loc) · 1002 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# Built-in tree generator
import html5lib
import urllib2
def make_native_tree():
    """Fetch the lunch-menu page and parse it with html5lib's default tree.

    Returns:
        The document object produced by ``html5lib.HTMLParser().parse``
        using the parser's built-in (default) tree builder.
    """
    fd = urllib2.urlopen('http://mehfilindian.com/LunchMenuTakeOut.htm')
    try:
        parser = html5lib.HTMLParser()
        # Parse the opened response object (`fd`); there is no name `f`
        # in scope, so passing it would raise NameError.
        document = parser.parse(fd)
    finally:
        # Release the HTTP connection even if parsing raises.
        fd.close()
    return document
# If you want a specific tree format
# minidom
import html5lib
from html5lib import treebuilders
import urllib2
def make_dom():
    """Fetch the lunch-menu page and parse it into a DOM tree.

    The parser is configured with html5lib's ``"dom"`` tree builder.

    Returns:
        The document object produced by parsing the page with the
        ``"dom"`` tree builder.
    """
    fd = urllib2.urlopen('http://mehfilindian.com/LunchMenuTakeOut.htm')
    try:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        minidom_document = parser.parse(fd)
    finally:
        # Release the HTTP connection even if parsing raises.
        fd.close()
    return minidom_document
# BeautifulSoup
import html5lib
from html5lib import treebuilders
import urllib2
def make_soup():
    """Fetch the lunch-menu page and parse it into a BeautifulSoup tree.

    The parser is configured with html5lib's ``"beautifulsoup"`` tree
    builder.

    Returns:
        The document object produced by parsing the page with the
        ``"beautifulsoup"`` tree builder.
    """
    fd = urllib2.urlopen('http://mehfilindian.com/LunchMenuTakeOut.htm')
    try:
        parser = html5lib.HTMLParser(
            tree=treebuilders.getTreeBuilder("beautifulsoup"))
        # Named `soup` rather than `minidom_document`: this builder does
        # not produce a minidom tree.
        soup = parser.parse(fd)
    finally:
        # Release the HTTP connection even if parsing raises.
        fd.close()
    return soup
# More info: http://code.google.com/p/html5lib/wiki/UserDocumentation
# Alias: `make_tree` is bound to `make_dom`, the builder that uses the
# "dom" tree builder. Rebind it to another make_* function to switch formats.
make_tree = make_dom