pythonlearn/old/old-code/wsave.py at text-formatting · StudyCourse/pythonlearn

History

108 lines (93 loc) · 3.04 KB

Raw

100

101

102

103

104

105

106

107

108

import string

import sqlite3

import urllib

import xml.etree.ElementTree as ET

from BeautifulSoup import *

# Open the SQLite database file that serves as the page cache.  The path is
# relative, so 'wikidata.db' is looked up (and created by sqlite3 if missing)
# in the current working directory.
conn = sqlite3.connect('wikidata.db')

# Module-level cursor; tinyTable() uses this same cursor for its queries.
cur = conn.cursor()

# One row per cached URL: the page content plus the time it was stored.
# IF NOT EXISTS lets this statement run on every start without error.
cur.execute('''

CREATE TABLE IF NOT EXISTS TinyTable (id INTEGER PRIMARY KEY,

url TEXT, page BLOB, retrieved_at timestamp)''')

# A slightly extended dictionary

class sash(dict):
    """A dict that can also list its items ordered by value."""

    def sortvalues(self, reverse=True):
        """Return the ``(key, value)`` pairs ordered by value, then by key.

        With the default ``reverse=True`` the largest values come first; pass
        ``reverse=False`` for ascending order.  A new list is returned and the
        dictionary itself is not modified.
        """
        def value_then_key(pair):
            key, value = pair
            return (value, key)

        pairs = list(self.items())
        pairs.sort(key=value_then_key, reverse=reverse)
        return pairs

def tinyTable(url):
    """Return the content of *url*, going through the TinyTable SQLite cache.

    If TinyTable already holds a row for *url*, the stored retrieval time is
    printed and the stored page is returned without touching the network.
    Otherwise the page is downloaded, inserted into TinyTable (decoded as
    UTF-8) together with the current time, the transaction is committed, and
    the downloaded data is returned.

    Uses the module-level ``cur`` and ``conn`` objects.

    NOTE(review): a cache hit returns whatever SQLite hands back for the
    stored page, whereas a cache miss returns the undecoded download, so the
    two paths may yield different string types -- confirm callers cope.
    """
    cur.execute('SELECT id,page,retrieved_at FROM TinyTable WHERE URL = ?', (url, ))
    row = cur.fetchone()

    # fetchone() returns None when nothing matched; test for that explicitly
    # rather than relying on a bare ``except`` around ``row[2]``, which also
    # hid unrelated errors (including Ctrl-C).
    if row is not None:
        # Single-argument print(...) emits the same text as the Python 2
        # print statement and is also valid Python 3 syntax.
        print('DATE %s' % (row[2],))
        return row[1]

    print('Retrieving %s' % (url,))
    response = urllib.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()

    # Only the INSERT path is needed: a cache hit has already returned above,
    # so there is never an existing row to UPDATE at this point.
    cur.execute("INSERT INTO TinyTable (url, page, retrieved_at) VALUES (?, ?, datetime('now'))",(url, unicode(data, 'utf-8')))
    conn.commit()
    return data

# Crawl the wiki starting at its home page.  For every page fetched:
#   * count the parenthesised snippets found in the first child of each <p>,
#   * queue every relative link for a later visit,
#   * for '.html' pages, fetch the matching '.20.rss' feed and count one edit
#     per item under the name taken from the item's description.
# When the queue is empty, both tallies are printed and the DB is closed.
cururl = 'https://ctools.umich.edu/access/wiki/site/f57681b8-6db9-46cf-aad1-3a0bdd621138/home.html'

urls = [cururl]        # URLs still to fetch; pop() takes the newest first
visited = set()        # page and feed URLs already handled (O(1) lookups)
editcounts = sash()    # editor name -> number of RSS items
postcounts = sash()    # lower-cased "(...)" snippet -> occurrences

# Compiled once, outside the loop.  Raw string keeps the backslashes literal.
# NOTE(review): `re` is not imported explicitly in this file; presumably it
# arrives through `from BeautifulSoup import *` -- confirm.
p = re.compile(r'\(.*?\)')

while urls:
    # Single-argument print(...) emits the same text as the Python 2 print
    # statement and is also valid Python 3 syntax.
    print('=== URLS Yet To Retrieve: %s' % (len(urls),))
    cururl = urls.pop()
    if cururl in visited:
        continue

    print('RETRIEVING %s' % (cururl,))
    data = tinyTable(cururl)
    visited.add(cururl)
    soup = BeautifulSoup(data)

    # --- parenthesised snippets in paragraphs -------------------------------
    for para in soup('p'):
        try:
            posters = p.findall(para.contents[0])
        except Exception:
            # Best effort, as before: a paragraph with no children or whose
            # first child is not text is skipped.  `Exception` (rather than a
            # bare except) lets Ctrl-C / SystemExit still stop the crawl.
            continue
        for poster in posters:
            poster = poster.lower()
            postcounts[poster] = postcounts.get(poster, 0) + 1

    # --- links to follow ----------------------------------------------------
    for tag in soup('a'):
        url = tag.get('href', None)
        if url is None:
            continue
        # Don't follow absolute urls
        if url.startswith('http'):
            continue
        newurl = urllib.basejoin(cururl, url)
        if newurl in visited:
            continue
        urls.append(newurl)

    # --- RSS change feed for this page --------------------------------------
    if not cururl.endswith('.html'):
        continue
    # Swap only the trailing '.html'; str.replace would also rewrite any
    # '.html' occurring earlier in the URL.
    newurl = cururl[:-len('.html')] + '.20.rss'
    if newurl in visited:
        continue

    print('RSS: %s' % (newurl,))
    data = tinyTable(newurl)
    visited.add(newurl)

    stuff = ET.fromstring(data)
    lst = stuff.findall('channel/item/description')
    print('Item count: %s' % (len(lst),))
    for item in lst:
        # An empty <description/> has text None; treat it as no words.
        words = (item.text or '').split()
        if len(words) < 6:
            # Words 3..5 are read below; a shorter description cannot supply
            # them, so skip it instead of aborting the crawl with IndexError.
            continue
        name = words[3] + ' ' + words[4]
        # A sixth word other than 'at' is taken as a third part of the name.
        if words[5] != 'at':
            name = name + ' ' + words[5]
        editcounts[name] = editcounts.get(name, 0) + 1

print('EDITS:')
# Highest edit counts first (sash.sortvalues defaults to reverse=True).
for (key, val) in editcounts.sortvalues():
    print('%s %s' % (key, val))

# Snippet counts in ascending key order.
for (key, val) in sorted(postcounts.items()):
    print('%s %s' % (key, val))

conn.close()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

wsave.py

Latest commit

History

wsave.py

File metadata and controls