forked from csev/py4e
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwsave.py
More file actions
108 lines (93 loc) · 3.04 KB
/
wsave.py
File metadata and controls
108 lines (93 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import string
import sqlite3
import urllib
import xml.etree.ElementTree as ET
from BeautifulSoup import *
conn = sqlite3.connect('wikidata.db')
cur = conn.cursor()
cur.execute('''
CREATE TABLE IF NOT EXISTS TinyTable (id INTEGER PRIMARY KEY,
url TEXT, page BLOB, retrieved_at timestamp)''')
# A slightly extended dictionary
class sash(dict):
def sortvalues(self,reverse=True):
return sorted(self.items(),key=lambda x: (x[1], x[0]), reverse=reverse)
def tinyTable(url):
global cur,conn
cur.execute('SELECT id,page,retrieved_at FROM TinyTable WHERE URL = ?', (url, ))
try:
row = cur.fetchone()
print 'DATE',row[2]
return row[1]
except:
row = None
print 'Retrieving', url
data = urllib.urlopen (url).read()
if row != None:
cur.execute("UPDATE TinyTable SET page=?,retrieved_at=datetime('now') WHERE id=?", (unicode(data, 'utf-8'), row[0]))
else:
cur.execute("INSERT INTO TinyTable (url, page, retrieved_at) VALUES (?, ?, datetime('now'))",(url, unicode(data, 'utf-8')))
conn.commit()
return data
cururl = 'https://ctools.umich.edu/access/wiki/site/f57681b8-6db9-46cf-aad1-3a0bdd621138/home.html'
urls = list()
urls.append(cururl)
visited = list()
editcounts = sash()
postcounts = sash()
while len(urls) > 0 :
print '=== URLS Yet To Retrieve:',len(urls)
cururl = urls.pop()
if cururl in visited: continue
print 'RETRIEVING',cururl
data = tinyTable(cururl)
visited.append(cururl)
soup = BeautifulSoup(data)
# print data[:3000]
p = re.compile('\(.*?\)')
paragraphs = soup('p')
for para in paragraphs:
try:
posters = p.findall(para.contents[0])
except:
posters = list()
for poster in posters:
poster = poster.lower()
postcounts[poster] = postcounts.get(poster,0) + 1
tags = soup('a')
# print 'Tags'
for tag in tags:
# print tag
url = tag.get('href',None)
if url == None : continue
# Don't follow absolute urls
if url.startswith('http') : continue
newurl = urllib.basejoin(cururl,url)
if newurl in visited : continue
# print 'APPENDING',newurl
urls.append(newurl)
if not cururl.endswith('.html') : continue
newurl = cururl.replace('.html','.20.rss')
if newurl in visited: continue
print 'RSS:', newurl
data = tinyTable(newurl)
visited.append(newurl)
# print data[:500]
stuff = ET.fromstring(data)
lst = stuff.findall('channel/item/description')
print 'Item count:', len(lst)
for item in lst:
dir(item)
# print 'Text', item.text
words = item.text.split()
# print words[:10]
name = words[3] + ' ' + words[4]
if words[5] != 'at' : name = name + ' ' + words[5]
# print name
editcounts[name] = editcounts.get(name, 0 ) + 1
print 'EDITS:'
for (key,val) in editcounts.sortvalues():
print key, val
for (key,val) in sorted(postcounts.items()):
print key, val
conn.close()