Fix Broken Links

Python

Public Domain

Updates redirected links and reports 404s. Beware: remote servers sometimes treat this script as a malicious robot, so it is important to check the output carefully.

Download (right click, save as, rename as appropriate)

Embed

(Line-number gutter from the embedded code listing removed; the source follows below.)
#!/usr/bin/python

import MySQLdb
import re
import string
from BeautifulSoup import *
import time
import urllib2
import socket

# Global socket timeout: urllib2 inherits this, so a hung remote server
# cannot stall the crawl indefinitely.
socket.setdefaulttimeout(10) # Prevents really long timeouts

t1 = time.clock() #records start time

# Connection to the ExpressionEngine weblog database (credentials redacted).
db = MySQLdb.connect(host="************", user="*******", passwd="*************", db="*******")
cursor = db.cursor(MySQLdb.cursors.DictCursor)  # DictCursor: rows come back keyed by column name
externallink = re.compile('^http', re.IGNORECASE)  # absolute http(s) URLs -- the only ones checked
mailtolink = re.compile('^mailto', re.IGNORECASE)  # mail links -- skipped
anchorname = re.compile('^#', re.IGNORECASE)  # in-page anchors -- skipped
spaces = re.compile('%20')  # stray percent-encoded spaces that crept into hrefs

def fixlinks(html,title,entry_id):
    soup = BeautifulSoup(html)
    links = soup.findAll('a')
    urls = []
    returnlist = {'fail':[], 'redirect':[]}

    for i in links:
        try:
            if not (re.match(mailtolink, i['href'])) and not (re.match(anchorname, i['href'])) and (re.match(externallink, i['href'])):
                try:
                    print "\ttrying",i['href']
                    i['href'] = re.sub(spaces, '', i['href']) #random %20s in links
                    foo = urllib2.urlopen(i['href'])
                    if (i['href'] != foo.geturl()): #update redirected links and log them
                        returnlist['redirect'].append((str(i['href']), str(foo.geturl())))
                        i['href'] = foo.geturl()    
                        print "\t\tredirected"
                except urllib2.HTTPError:
                    returnlist['fail'].append(i['href']) #log 404s
                    print "\t\t404 error"
        except KeyError: #ignore anchor links w/o href attributes (e.g. name links, etc.)
            pass
            
    return [title, entry_id, db.escape_string(str(soup)), returnlist]
        
# Normally, a list of arbitrary field names would be fine. However,
# ExpressionEngine stores custom fields in columns named field_id_<n>,
# so translate the bare ids into their actual column names.
# (The old in-place fields[fields.index(i)] rewrite mutated the list
# while iterating it and broke on duplicate ids.)
fields = ['field_id_' + str(i) for i in [304]]

# Pull every entry (for site 8) whose checked field is non-empty, joined
# with its title. Field/column names are interpolated here (not user
# input); values are not involved in this query.
cursor.execute("SELECT %s,exp_weblog_data.entry_id,title FROM exp_weblog_data JOIN exp_weblog_titles on exp_weblog_data.entry_id = exp_weblog_titles.entry_id WHERE exp_weblog_data.site_id=8 AND field_id_304 <> '' ORDER BY exp_weblog_data.entry_id" % (','.join(fields),))

result = cursor.fetchall()

output = ""  # accumulated human-readable report, written to 'log' at the end

totalrows = len(result)  # countdown of rows left to process

print str(totalrows), "results"

criticalfailures = []  # entries that raised unexpected errors during processing

for i in result:
    for j in fields:
        try:
            print str(i['entry_id']) + ":"
            totalrows = totalrows - 1
            warnings = fixlinks(i[j],i['title'],i['entry_id'])
            cursor.execute("UPDATE exp_weblog_data SET %s='%s' WHERE entry_id=%s" % (j, warnings[2], warnings[1]))
            print "row %s updated, %s to go\n" %(i['entry_id'], totalrows)
            output += "%s (%s):\n" %(warnings[0],warnings[1])
            if len(warnings[3]['fail']):
                output += "\t404 Errors:\n"
                for i in warnings[3]['fail']:
                    output += "\t\t%s\n" %(i)
            if len(warnings[3]['redirect']):
                output += "\tRedirects:\n"
                for i in warnings[3]['redirect']:
                    output += "\t\t%s => %s\n" %(i[0],i[1])

        except Exception, msg: #catch unexpected errors
            string = str(i['entry_id']) + ": " + str(Exception) + " " + str(msg)
            criticalfailures.append(string)


output += "\n\n\nCritical Failures\n:"
for i in criticalfailures:
    output += i
    
f = open('log', 'w')
f.write(output)
f.close()

cursor.close() 

t2 = time.clock()


print t2 - t1