#!/usr/bin/python
import MySQLdb
import re
from BeautifulSoup import *
import time
t1 = time.clock()
db = MySQLdb.connect(host="localhost", user="*****", passwd="*****", db="*****")
cursor = db.cursor(MySQLdb.cursors.DictCursor)
reg = {} #ugly stuff even BeautifulSoup doesn't handle well
reg['fonttags'] = re.compile('<\/{0,1}font[^>]*>', re.IGNORECASE)
reg['mswordcruft'] = re.compile('<\/{0,1}[vwo]:[^>]*>.*\n', re.IGNORECASE)
reg['mswordcomments'] = re.compile('<!-{0,2}\[[^>]*>', re.MULTILINE)
reg['fontsize'] = re.compile('font-size:[^;"]pt;{0,1}', re.IGNORECASE)
reg['fontfamily'] = re.compile('font-family:[^;"];{0,1}', re.IGNORECASE)
reg['msostuff'] = re.compile('mso-:[^;"];{0,1}', re.IGNORECASE)
reg['msoclasses'] = re.compile(' class="MsoNormal"', re.IGNORECASE)
reg['xmltags'] = re.compile('<\/{0,1}xml>', re.IGNORECASE)
reg['linktags'] = re.compile('<\/{0,1}link[^>]*>', re.IGNORECASE)
reg['linebreaks'] = re.compile('^\s*$\n\n', re.IGNORECASE)
def escapehtml(html, row):
for i in reg:
html = re.sub(reg[i], '', html)
try:
soup = BeautifulSoup(html)
junktags = soup.findAll(['style', 'meta', ])
for i in junktags:
i.extract()
html = soup.prettify()
except:
print row
html = db.escape_string(html)
return html
fields = [1,2,5,6,20] # because expressionengine has non-human-readable schema that has to be hacked around.
for i in fields:
fields[fields.index(i)] = 'field_id_' + str(i)
cursor.execute("SELECT " + ','.join(fields) + ",entry_id FROM exp_weblog_data WHERE site_id=1")
result = cursor.fetchall()
for i in result:
for j in fields:
cursor.execute("UPDATE exp_weblog_data SET %s='%s' WHERE entry_id=%s" % (str(j), escapehtml(i[j], str(i['entry_id'])), str(i['entry_id'])))
#print escapehtml(i[j]) #debugging
cursor.close()
t2 = time.clock()
print t2 - t1 #reports time, that's all