import re
import sqlite3
#----------------setup database
conn = sqlite3.connect('h:\\Python\\ReadWord\\readword.sqlite')
c = conn.cursor()
# Do this instead
#t = (symbol,)
c.execute('''SELECT * FROM uniquephrase WHERE match_text_sum LIKE "a%" ''')
for row in c:
waitForInput = input('--> ')
print(row)
# Save (commit) the changes
#conn.commit()
# Close db before too many connections
c.close()
Thursday, May 24, 2012
Select from SQLite Database using Python return records that start with a Character
Friday, May 18, 2012
Compare Words in Two Files All Matches and Store in an Sqlite Database using Python
import re
import sqlite3
COMMONWORDS=['the',
'be',
'to',
'of',
'and',
'a',
'in',
'that',
'have',
'I',
'it',
'for',
'not',
'on',
'with',
'he',
'as',
'you',
'do',
'at',
'this',
'but',
'his',
'by',
'from',
'they',
'we',
'say',
'her',
'she',
'or',
'an',
'will',
'my',
'one',
'all',
'would',
'there',
'their',
'what',
'so',
'up',
'out',
'if',
'about',
'who',
'get',
'which',
'go',
'me',
'when',
'make',
'can',
'like',
'time',
'no',
'just',
'him',
'know',
'take',
'person',
'into',
'year',
'your',
'good',
'some',
'could',
'them',
'see',
'other',
'than',
'then',
'now',
'look',
'only',
'come',
'its',
'over',
'think',
'also',
'back',
'after',
'use',
'two',
'how',
'our',
'work',
'first',
'well',
'way',
'even',
'new',
'want',
'because',
'any',
'these',
'give',
'day',
'most',
'us']
#----------------setup database
conn = sqlite3.connect('h:\\Python\\ReadWord\\readword.sqlite')
#for pythonanywhere
#conn = sqlite3.connect('/home/torontoomar/readword3.sqlite')
c = conn.cursor()
# Create table matchtext
c.execute('''DROP TABLE IF EXISTS matchtext''')
c.execute('''create table matchtext(ID INTEGER PRIMARY KEY, match_text text, length_text real,
file1_position real, file2_position real,
file1_pad text, file2_pad text, match_text_sum text, length_sum_text real, uniquephraseID INTEGER)''')
# Create table unique_phrase
c.execute('''DROP TABLE IF EXISTS uniquephrase''')
c.execute('''create table uniquephrase(ID INTEGER PRIMARY KEY, match_text_sum text, length_sum_text real, numtimes real)''')
# Save (commit) the changes
conn.commit()
#----------------end setup database
def checkIfInDatabase(match_text_sum,length_sum_text ):
tuple_to_select = (match_text_sum,)
c.execute('''select * from uniquephrase where match_text_sum=?''', tuple_to_select )
#need to fetchone since can't rely on c.rowcount
cursorRow = c.fetchone()
lastrowid=1
if cursorRow is not None:
cursorRowId=cursorRow[0]
lastrowid=cursorRowId
print('id:',cursorRowId)
cursorNumTimes = cursorRow[3]
print('numTimes:',cursorNumTimes)
cursorNumTimes +=1
tuple_to_update = (cursorNumTimes, cursorRowId)
print(cursorRowId)
c.execute('''update uniquephrase set numtimes=? where ID=?''', tuple_to_update )
conn.commit()
else:
tuple_to_insert =(match_text_sum,length_sum_text,1)
c.execute('''insert into uniquephrase(match_text_sum, length_sum_text, numtimes) values (?,?,?)''', tuple_to_insert)
conn.commit()
#only works for insert statement lastrowid
lastrowid=c.lastrowid
return lastrowid
def addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, diagonalArray):
# go through common words an only pick ones that don't exist
# if all removed, then special case?
newDiagonalArray=[]
for matchWord in diagonalArray:
uniqueWord=True
for commonWord in COMMONWORDS:
if matchWord == commonWord:
uniqueWord=False
if uniqueWord==True:
newDiagonalArray.append(matchWord)
length_sum_text=len(newDiagonalArray)
match_text_sum =" ".join(newDiagonalArray)
#check if in database, if not add as a unique item
lastrowid=checkIfInDatabase(match_text_sum, length_sum_text)
# id is auto increment if omit, but must name columns
tuple_to_insert = (match_text, length_text, file1_position,file2_position, file1_pad, file2_pad,match_text_sum, length_sum_text,lastrowid)
c.execute('''insert into matchtext(match_text, length_text,
file1_position, file2_position,file1_pad,
file2_pad, match_text_sum, length_sum_text, uniquephraseID) values (?,?,?,?,?,?,?,?,?)''', tuple_to_insert)
conn.commit()
#for pythonanywhere
#file1 = open('/home/torontoomar/file1.txt', 'r')
file1 = open('h:\\Python\\ReadFile\\file1.txt', 'r')
file1Text = file1.read()
file1.close()
#for pythonanywhere
#file2 = open('/home/torontoomar/file2.txt', 'r')
file2 = open('h:\\Python\\ReadFile\\file2.txt', 'r')
file2Text = file2.read()
file2.close()
file1Text=file1Text.lower()
file2Text=file2Text.lower()
file1List=file1Text.splitlines()
file2List=file2Text.splitlines()
file1Text=" ".join(file1List)
file2Text=" ".join(file2List)
#clean up file1Text and file2Text , and .
file1TextArray = re.findall(r"\w+", file1Text)
file2TextArray = re.findall(r"\w+",file2Text)
len_file1=len(file1TextArray)
len_file2=len(file2TextArray)
diagonal_size=(len_file1+len_file2)-1
currentDiagonalArray=[]
MAX_DIAGONAL_SIZE=3
PAD_SPACE=30
currentDiagonalLen=0
maxrow=len_file2
maxcol=len_file1
print("1,1----------------------------")
#----------------------------------
#do case 1,1
if maxrow==maxcol:
total_range=maxrow
elif maxrow > maxcol:
total_range=maxcol
elif maxrow < maxcol:
total_range=maxrow
file1_index=0
file2_index=0
for i in range(total_range):
#while (1):
file1Word = file1TextArray[file1_index]
file2Word = file2TextArray[file2_index]
#print(file1_index,":",file2_index)
#print(file1Word," ",file2Word)
#-----add to database-------
if file1Word==file2Word:
#http://wiki.python.org/moin/PythonSpeed/PerformanceTips
currentDiagonalArray.append(file1Word)
else:
if len(currentDiagonalArray) > MAX_DIAGONAL_SIZE:
currentDiagonalStr=" ".join(currentDiagonalArray)
print(len(currentDiagonalStr),currentDiagonalStr," : ", file1_index , " , ", file2_index)
#---------------------------------------------------------
padFile1Before=file1_index-PAD_SPACE
padFile1After=file1_index+PAD_SPACE
padFile2Before=file2_index-PAD_SPACE
padFile2After=file2_index+PAD_SPACE
quote_file1Array=file1TextArray[padFile1Before:padFile1After]
quote_file2Array=file2TextArray[padFile2Before:padFile2After]
quote_file1Str=" ".join(quote_file1Array)
quote_file2Str=" ".join(quote_file2Array)
print("q: ",quote_file1Str)
print("b: ",quote_file2Str)
print("---------------------------------")
#insert into database and commit
match_text=currentDiagonalStr
length_text=len(currentDiagonalStr)
file1_position=file1_index
file2_position=file2_index
file1_pad=quote_file1Str
file2_pad=quote_file2Str
addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, currentDiagonalArray)
#---------------------------------------
currentDiagonalArray=[]
currentDiagonalLen=0
#-----end add to database------
#increment downwards
file1_index+=1
file2_index+=1
# C---------------------------
##need to clean up
## DUMP
print("above----------------------------")
#----------------------------------
# do case minrow,column from 2 to maxrow
# startend = (maxcol-varcol) + 1, maxcolumn
# above the line
currentDiagonalArray=[]
currentDiagonalLen=0
file1_index=1
file2_index=0
prev_file1_index=1
counter=0
#for i in range(total_range):
while (1):
counter+=1
file1Word = file1TextArray[file1_index]
file2Word = file2TextArray[file2_index]
#print(file1_index,":",file2_index)
#print(file1Word," ",file2Word)
#-----add to database-------
if file1Word==file2Word:
currentDiagonalArray.append(file1Word)
else:
if len(currentDiagonalArray) > MAX_DIAGONAL_SIZE:
currentDiagonalStr=" ".join(currentDiagonalArray)
print(len(currentDiagonalStr),currentDiagonalStr," : ", file1_index , " , ", file2_index)
#---------------------------------------------------------
padFile1Before=file1_index-PAD_SPACE
padFile1After=file1_index+PAD_SPACE
padFile2Before=file2_index-PAD_SPACE
padFile2After=file2_index+PAD_SPACE
quote_file1Array=file1TextArray[padFile1Before:padFile1After]
quote_file2Array=file2TextArray[padFile2Before:padFile2After]
quote_file1Str=" ".join(quote_file1Array)
quote_file2Str=" ".join(quote_file2Array)
print("q: ",quote_file1Str)
print("b: ",quote_file2Str)
print("---------------------------------")
#insert into database and commit
match_text=currentDiagonalStr
length_text=len(currentDiagonalStr)
file1_position=file1_index
file2_position=file2_index
file1_pad=quote_file1Str
file2_pad=quote_file2Str
addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, currentDiagonalArray)
#---------------------------------------
currentDiagonalArray=[]
currentDiagonalLen=0
#-----end add to database------
#increment downwards
file1_index+=1
file2_index+=1
if file2_index > (maxrow-1) or file1_index > (maxcol-1):
currentDiagonalArray=[]
#print(maxrow, "greater file2_index than maxrow-1", file2_index)
file2_index=0
prev_file1_index+=1
file1_index=prev_file1_index
if file1_index > (maxcol-1):
#print("MAXIMUM above")
break
print("counter",counter,"range",total_range)
#print("below----------------------------")
#----------------------------------
# below the line
currentDiagonalArray=[]
currentDiagonalLen=0
file1_index=0
file2_index=1
total_range = ((maxrow-1) * maxcol) - 1
prev_file2_index=1
counter=0
#for i in range(total_range):
while(1):
counter+=1
file1Word = file1TextArray[file1_index]
file2Word = file2TextArray[file2_index]
#print(file1_index,":",file2_index)
#print(file1Word," ",file2Word)
#-----add to database-------
if file1Word==file2Word:
currentDiagonalArray.append(file1Word)
currentDiagonalLen+=1
else:
if len(currentDiagonalArray) > MAX_DIAGONAL_SIZE:
currentDiagonalStr=" ".join(currentDiagonalArray)
print(len(currentDiagonalStr),currentDiagonalStr," : ", file1_index , " , ", file2_index)
#---------------------------------------------------------
padFile1Before=file1_index-PAD_SPACE
padFile1After=file1_index+PAD_SPACE
padFile2Before=file2_index-PAD_SPACE
padFile2After=file2_index+PAD_SPACE
quote_file1Array=file1TextArray[padFile1Before:padFile1After]
quote_file2Array=file2TextArray[padFile2Before:padFile2After]
quote_file1Str=" ".join(quote_file1Array)
quote_file2Str=" ".join(quote_file2Array)
print("q: ",quote_file1Str)
print("b: ",quote_file2Str)
print("---------------------------------")
#insert into database and commit
match_text=currentDiagonalStr
length_text=len(currentDiagonalStr)
file1_position=file1_index
file2_position=file2_index
file1_pad=quote_file1Str
file2_pad=quote_file2Str
addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, currentDiagonalArray)
#---------------------------------------
currentDiagonalArray=[]
currentDiagonalLen=0
#-----end add to database------
if(counter % 100000000 == 0):
print(total_range/counter)
#increment downwards
file1_index+=1
file2_index+=1
if file2_index > (maxrow-1) or file1_index > (maxcol-1):
currentDiagonalArray=[]
currentDiagonalLen=0
#print(maxrow, "greater file2_index than maxrow-1", file2_index)
file1_index=0
prev_file2_index+=1
file2_index=prev_file2_index
if file2_index > (maxrow-1):
#print("MAXIMUM below")
break
# Close db before too many connections
c.close()
waitForInput = input('--> ')
print(waitForInput)
Subscribe to:
Comments (Atom)