#!/usr/bin/python
import MySQLdb
import re
import string
from BeautifulSoup import *
import time
import urllib2
import socket
# Abort any socket operation that blocks for more than 10 seconds, so a single
# unresponsive host cannot hang the whole run.
socket.setdefaulttimeout(10)

# Start-of-run reading; a second reading is taken at the very end of the script.
t1 = time.clock()

# Link classifiers, each anchored at the start of an href value.
externallink = re.compile('^http', re.I)
mailtolink = re.compile('^mailto', re.I)
anchorname = re.compile('^#', re.I)
# A literal URL-encoded space.
spaces = re.compile('%20')

# Database handle, plus a cursor that hands back each row as a dict keyed by
# column name.
db = MySQLdb.connect(
    host="************",
    user="*******",
    passwd="*************",
    db="*******"
)
cursor = db.cursor(MySQLdb.cursors.DictCursor)
def fixlinks(html, title, entry_id):
    """Check every external link in *html* and rewrite the redirected ones.

    Each ``<a>`` tag whose ``href`` starts with ``http`` (case-insensitive) is
    requested with ``urllib2``.  If the request ends up at a different URL,
    the tag is updated to point there; if the request fails, the link is
    recorded as broken and left in place.

    Parameters:
        html: markup of one entry field.
        title: entry title; passed through untouched for the caller's log.
        entry_id: entry id; passed through untouched for the caller's query.

    Returns:
        A four-item list ``[title, entry_id, escaped_html, report]`` where
        ``escaped_html`` is the (possibly modified) markup run through
        ``db.escape_string`` and ``report`` is a dict with two keys:
        ``'fail'`` (list of hrefs that could not be fetched) and
        ``'redirect'`` (list of ``(old_url, new_url)`` tuples).
    """
    soup = BeautifulSoup(html)
    returnlist = {'fail': [], 'redirect': []}
    for link in soup.findAll('a'):
        try:
            href = link['href']
        except KeyError:
            # Anchors without an href (e.g. <a name="...">) have nothing to check.
            continue
        # Only http/https links are checked.  A value that starts with "http"
        # cannot also start with "mailto" or "#", so this single test covers
        # the mailto and in-page-anchor exclusions as well.
        if not href or not externallink.match(href):
            continue

        print("\ttrying %s" % href)
        # Remove stray %20s from the link; the cleaned value is kept in the
        # markup even if the request below fails.
        # NOTE(review): this drops *every* %20, including ones that encode a
        # real space inside a path -- confirm that is intended.
        href = spaces.sub('', href)
        link['href'] = href

        try:
            response = urllib2.urlopen(href)
        except urllib2.HTTPError as err:
            # The server answered, but with an error status.
            returnlist['fail'].append(href)
            print("\t\t%s error" % err.code)
            continue
        except (urllib2.URLError, socket.error) as err:
            # DNS failure, refused connection, timeout, ...: log the link as
            # broken instead of abandoning the rest of the entry.
            returnlist['fail'].append(href)
            print("\t\tunreachable: %s" % err)
            continue

        try:
            final_url = response.geturl()
        finally:
            # Release the connection; the response body is not needed.
            response.close()

        if href != final_url:
            # Point the link at its final destination and log the change.
            returnlist['redirect'].append((str(href), str(final_url)))
            link['href'] = final_url
            print("\t\tredirected")

    return [title, entry_id, db.escape_string(str(soup)), returnlist]
fields = [304,]
# ExpressionEngine stores custom fields in columns named "field_id_<n>", so
# turn the numeric ids above into the real column names.
fields = ['field_id_' + str(field_id) for field_id in fields]

# Fetch the selected field(s), id and title of every site-8 entry whose
# field_id_304 is not empty.
cursor.execute("SELECT %s,exp_weblog_data.entry_id,title FROM exp_weblog_data JOIN exp_weblog_titles on exp_weblog_data.entry_id = exp_weblog_titles.entry_id WHERE exp_weblog_data.site_id=8 AND field_id_304 <> '' ORDER BY exp_weblog_data.entry_id" % (','.join(fields),))
result = cursor.fetchall()
output = ""
totalrows = len(result)
print("%s results" % totalrows)
criticalfailures = []
for row in result:
    print(str(row['entry_id']) + ":")
    # Count each entry once, no matter how many fields are processed for it.
    totalrows -= 1
    for column in fields:
        try:
            title, entry_id, escaped_html, link_report = fixlinks(
                row[column], row['title'], row['entry_id'])
            # fixlinks() already escaped the markup for use inside a quoted
            # SQL string, so it is interpolated as-is here.
            cursor.execute("UPDATE exp_weblog_data SET %s='%s' WHERE entry_id=%s" % (column, escaped_html, entry_id))
            # Persist each row as soon as it is written so that finished work
            # survives a later crash (needed on transactional tables).
            db.commit()
            print("row %s updated, %s to go\n" % (row['entry_id'], totalrows))

            # Build this entry's log section, then append it in one step.
            entry_log = ["%s (%s):\n" % (title, entry_id)]
            if link_report['fail']:
                entry_log.append("\t404 Errors:\n")
                for broken_url in link_report['fail']:
                    entry_log.append("\t\t%s\n" % (broken_url,))
            if link_report['redirect']:
                entry_log.append("\tRedirects:\n")
                for old_url, new_url in link_report['redirect']:
                    entry_log.append("\t\t%s => %s\n" % (old_url, new_url))
            output += "".join(entry_log)
        except Exception as msg:
            # Top-level boundary: record the failure and carry on with the
            # remaining entries rather than aborting the whole run.
            criticalfailures.append(
                "%s: %s %s" % (row['entry_id'], type(msg).__name__, msg))

output += "\n\n\nCritical Failures:\n"
for failure in criticalfailures:
    output += failure + "\n"

# The context manager closes the log even if the write fails.
with open('log', 'w') as f:
    f.write(output)

cursor.close()
db.close()

# NOTE(review): time.clock() may report CPU time rather than elapsed wall time
# on some platforms -- confirm this is the figure wanted for a network-bound run.
t2 = time.clock()
print(t2 - t1)