Clean DB

Python

Public Domain

Sanitizes a database

Download (right click, save as, rename as appropriate)

Embed

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/python

import MySQLdb
import re
from BeautifulSoup import *
import time


t1 = time.clock()

db = MySQLdb.connect(host="localhost", user="*****", passwd="*****", db="*****")
cursor = db.cursor(MySQLdb.cursors.DictCursor)


reg = {} #ugly stuff even BeautifulSoup doesn't handle well
reg['fonttags']          = re.compile('<\/{0,1}font[^>]*>', re.IGNORECASE)
reg['mswordcruft']       = re.compile('<\/{0,1}[vwo]:[^>]*>.*\n', re.IGNORECASE)
reg['mswordcomments']    = re.compile('<!-{0,2}\[[^>]*>', re.MULTILINE)
reg['fontsize']          = re.compile('font-size:[^;"]pt;{0,1}', re.IGNORECASE)
reg['fontfamily']        = re.compile('font-family:[^;"];{0,1}', re.IGNORECASE)
reg['msostuff']          = re.compile('mso-:[^;"];{0,1}', re.IGNORECASE)
reg['msoclasses']        = re.compile(' class="MsoNormal"', re.IGNORECASE)
reg['xmltags']           = re.compile('<\/{0,1}xml>', re.IGNORECASE)
reg['linktags']          = re.compile('<\/{0,1}link[^>]*>', re.IGNORECASE)
reg['linebreaks']        = re.compile('^\s*$\n\n', re.IGNORECASE)


def escapehtml(html, row):
    for i in reg:
        html = re.sub(reg[i], '', html)
    try:
        soup = BeautifulSoup(html)
        junktags = soup.findAll(['style', 'meta', ])
        for i in junktags:
            i.extract()
        html = soup.prettify()
    except:
        print row
    html = db.escape_string(html)
    return html

fields = [1,2,5,6,20] # because expressionengine has non-human-readable schema that has to be hacked around.
for i in fields:
    fields[fields.index(i)] = 'field_id_' + str(i)

cursor.execute("SELECT " + ','.join(fields) + ",entry_id FROM exp_weblog_data WHERE site_id=1")

result = cursor.fetchall()

for i in result:
    for j in fields:
        cursor.execute("UPDATE exp_weblog_data SET %s='%s' WHERE entry_id=%s" % (str(j), escapehtml(i[j], str(i['entry_id'])), str(i['entry_id'])))
        #print escapehtml(i[j]) #debugging

cursor.close() 

t2 = time.clock()

print t2 - t1 #reports time, that's all