Thursday, May 24, 2012

Select from SQLite Database using Python return records that start with a Character

import re
import sqlite3

#----------------setup database
conn = sqlite3.connect('h:\\Python\\ReadWord\\readword.sqlite')
c = conn.cursor()

# Do this instead
#t = (symbol,)
c.execute('''SELECT * FROM uniquephrase WHERE match_text_sum LIKE "a%" ''')

for row in c:
        waitForInput = input('--> ')
        print(row)

# Save (commit) the changes
#conn.commit()
# Close db before too many connections
c.close()



Friday, May 18, 2012

Compare Words in Two Files All Matches and Store in an Sqlite Database using Python

import re
import sqlite3


COMMONWORDS=['the',
'be',
'to',
'of',
'and',
'a',
'in',
'that',
'have',
'I',
'it',
'for',
'not',
'on',
'with',
'he',
'as',
'you',
'do',
'at',
'this',
'but',
'his',
'by',
'from',
'they',
'we',
'say',
'her',
'she',
'or',
'an',
'will',
'my',
'one',
'all',
'would',
'there',
'their',
'what',
'so',
'up',
'out',
'if',
'about',
'who',
'get',
'which',
'go',
'me',
'when',
'make',
'can',
'like',
'time',
'no',
'just',
'him',
'know',
'take',
'person',
'into',
'year',
'your',
'good',
'some',
'could',
'them',
'see',
'other',
'than',
'then',
'now',
'look',
'only',
'come',
'its',
'over',
'think',
'also',
'back',
'after',
'use',
'two',
'how',
'our',
'work',
'first',
'well',
'way',
'even',
'new',
'want',
'because',
'any',
'these',
'give',
'day',
'most',
'us']

#----------------setup database
conn = sqlite3.connect('h:\\Python\\ReadWord\\readword.sqlite')
#for pythonanywhere
#conn = sqlite3.connect('/home/torontoomar/readword3.sqlite')
c = conn.cursor()
# Create table matchtext
c.execute('''DROP TABLE IF EXISTS matchtext''')
c.execute('''create table matchtext(ID INTEGER PRIMARY KEY, match_text text, length_text real,
file1_position real, file2_position real,
file1_pad text, file2_pad text, match_text_sum text, length_sum_text real, uniquephraseID INTEGER)''')
# Create table unique_phrase
c.execute('''DROP TABLE IF EXISTS uniquephrase''')
c.execute('''create table uniquephrase(ID INTEGER PRIMARY KEY, match_text_sum text, length_sum_text real, numtimes real)''')


# Save (commit) the changes
conn.commit()
#----------------end setup database

def checkIfInDatabase(match_text_sum,length_sum_text ):
        tuple_to_select = (match_text_sum,)
        c.execute('''select * from uniquephrase where match_text_sum=?''', tuple_to_select )
        #need to fetchone since can't rely on c.rowcount
        cursorRow = c.fetchone()
        lastrowid=1
        if cursorRow is not None: 
                cursorRowId=cursorRow[0]
                lastrowid=cursorRowId
                print('id:',cursorRowId)
                cursorNumTimes = cursorRow[3]
                print('numTimes:',cursorNumTimes)
                cursorNumTimes +=1
                tuple_to_update = (cursorNumTimes, cursorRowId)
                print(cursorRowId)
                c.execute('''update uniquephrase set numtimes=? where ID=?''', tuple_to_update )
                conn.commit()
        else:
                tuple_to_insert =(match_text_sum,length_sum_text,1)
                c.execute('''insert into uniquephrase(match_text_sum, length_sum_text, numtimes) values (?,?,?)''', tuple_to_insert)        
                conn.commit()
                #only works for insert statement lastrowid
                lastrowid=c.lastrowid

        return lastrowid

def addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, diagonalArray):

        # go through common words an only pick ones that don't exist
        # if all removed, then special case?
        newDiagonalArray=[]
        for matchWord in diagonalArray:
                uniqueWord=True
                for commonWord in COMMONWORDS:
                        if matchWord == commonWord:
                                uniqueWord=False
                if uniqueWord==True:
                        newDiagonalArray.append(matchWord)

        length_sum_text=len(newDiagonalArray)
        match_text_sum =" ".join(newDiagonalArray)
        #check if in database, if not add as a unique item
        lastrowid=checkIfInDatabase(match_text_sum, length_sum_text)
        
        # id is auto increment if omit, but must name columns
        tuple_to_insert = (match_text, length_text, file1_position,file2_position, file1_pad, file2_pad,match_text_sum, length_sum_text,lastrowid)
        c.execute('''insert into matchtext(match_text, length_text,
                        file1_position, file2_position,file1_pad,
                        file2_pad, match_text_sum, length_sum_text, uniquephraseID) values (?,?,?,?,?,?,?,?,?)''', tuple_to_insert)
        conn.commit()

#for pythonanywhere
#file1 = open('/home/torontoomar/file1.txt', 'r')
file1 = open('h:\\Python\\ReadFile\\file1.txt', 'r')
file1Text = file1.read()
file1.close()

#for pythonanywhere
#file2 = open('/home/torontoomar/file2.txt', 'r')
file2 = open('h:\\Python\\ReadFile\\file2.txt', 'r')
file2Text = file2.read()
file2.close()

file1Text=file1Text.lower()
file2Text=file2Text.lower()

file1List=file1Text.splitlines()
file2List=file2Text.splitlines()
file1Text=" ".join(file1List)
file2Text=" ".join(file2List)


#clean up file1Text and file2Text , and .
file1TextArray = re.findall(r"\w+", file1Text)
file2TextArray = re.findall(r"\w+",file2Text)


len_file1=len(file1TextArray)
len_file2=len(file2TextArray)
diagonal_size=(len_file1+len_file2)-1

currentDiagonalArray=[]
MAX_DIAGONAL_SIZE=3
PAD_SPACE=30

currentDiagonalLen=0

maxrow=len_file2
maxcol=len_file1

print("1,1----------------------------")
#----------------------------------
#do case 1,1
if maxrow==maxcol:
        total_range=maxrow
elif maxrow > maxcol:
        total_range=maxcol
elif maxrow < maxcol:
        total_range=maxrow

file1_index=0
file2_index=0
for i in range(total_range):
#while (1):
        file1Word = file1TextArray[file1_index]
        file2Word = file2TextArray[file2_index]

        #print(file1_index,":",file2_index)
        #print(file1Word," ",file2Word)

        #-----add to database-------
        if file1Word==file2Word:
                #http://wiki.python.org/moin/PythonSpeed/PerformanceTips
                currentDiagonalArray.append(file1Word)


        else:
                if len(currentDiagonalArray) > MAX_DIAGONAL_SIZE:

                        currentDiagonalStr=" ".join(currentDiagonalArray)


                        print(len(currentDiagonalStr),currentDiagonalStr," : ", file1_index , " , ", file2_index)

                        #---------------------------------------------------------
                        padFile1Before=file1_index-PAD_SPACE
                        padFile1After=file1_index+PAD_SPACE

                        padFile2Before=file2_index-PAD_SPACE
                        padFile2After=file2_index+PAD_SPACE

                        quote_file1Array=file1TextArray[padFile1Before:padFile1After]
                        quote_file2Array=file2TextArray[padFile2Before:padFile2After]


                        quote_file1Str=" ".join(quote_file1Array)
                        quote_file2Str=" ".join(quote_file2Array)

                        print("q: ",quote_file1Str)
                        print("b: ",quote_file2Str)
                        print("---------------------------------")
                        #insert into database and commit
                        match_text=currentDiagonalStr
                        length_text=len(currentDiagonalStr)
                        file1_position=file1_index
                        file2_position=file2_index
                        file1_pad=quote_file1Str
                        file2_pad=quote_file2Str
                        addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, currentDiagonalArray)
                        #---------------------------------------

                currentDiagonalArray=[]
                currentDiagonalLen=0
        #-----end add to database------

        #increment downwards
        file1_index+=1
        file2_index+=1

# C---------------------------
##need to clean up
## DUMP


print("above----------------------------")
#----------------------------------

# do case minrow,column from 2 to maxrow
# startend = (maxcol-varcol) + 1, maxcolumn
# above the line
currentDiagonalArray=[]
currentDiagonalLen=0

file1_index=1
file2_index=0


prev_file1_index=1
counter=0

#for i in range(total_range):
while (1):
        counter+=1

        file1Word = file1TextArray[file1_index]
        file2Word = file2TextArray[file2_index]

        #print(file1_index,":",file2_index)
        #print(file1Word," ",file2Word)

        #-----add to database-------
        if file1Word==file2Word:
                currentDiagonalArray.append(file1Word)


        else:
                if len(currentDiagonalArray) > MAX_DIAGONAL_SIZE:

                        currentDiagonalStr=" ".join(currentDiagonalArray)
                        print(len(currentDiagonalStr),currentDiagonalStr," : ", file1_index , " , ", file2_index)
                        #---------------------------------------------------------
                        padFile1Before=file1_index-PAD_SPACE
                        padFile1After=file1_index+PAD_SPACE

                        padFile2Before=file2_index-PAD_SPACE
                        padFile2After=file2_index+PAD_SPACE

                        quote_file1Array=file1TextArray[padFile1Before:padFile1After]
                        quote_file2Array=file2TextArray[padFile2Before:padFile2After]


                        quote_file1Str=" ".join(quote_file1Array)
                        quote_file2Str=" ".join(quote_file2Array)

                        print("q: ",quote_file1Str)
                        print("b: ",quote_file2Str)
                        print("---------------------------------")
                        #insert into database and commit
                        match_text=currentDiagonalStr
                        length_text=len(currentDiagonalStr)
                        file1_position=file1_index
                        file2_position=file2_index
                        file1_pad=quote_file1Str
                        file2_pad=quote_file2Str
                        addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, currentDiagonalArray)
                        #---------------------------------------

                currentDiagonalArray=[]
                currentDiagonalLen=0
        #-----end add to database------


        #increment downwards
        file1_index+=1
        file2_index+=1

        if file2_index > (maxrow-1) or file1_index > (maxcol-1):
                currentDiagonalArray=[]
                #print(maxrow, "greater file2_index than maxrow-1", file2_index)
                file2_index=0
                prev_file1_index+=1
                file1_index=prev_file1_index

                if file1_index > (maxcol-1):
                        #print("MAXIMUM above")
                        break

print("counter",counter,"range",total_range)
#print("below----------------------------")
#----------------------------------
# below the line

currentDiagonalArray=[]
currentDiagonalLen=0

file1_index=0
file2_index=1
total_range = ((maxrow-1) * maxcol) - 1
prev_file2_index=1
counter=0
#for i in range(total_range):
while(1):
        counter+=1
        file1Word = file1TextArray[file1_index]
        file2Word = file2TextArray[file2_index]

        #print(file1_index,":",file2_index)
        #print(file1Word," ",file2Word)

        #-----add to database-------
        if file1Word==file2Word:
                currentDiagonalArray.append(file1Word)
                currentDiagonalLen+=1

        else:
                if len(currentDiagonalArray) > MAX_DIAGONAL_SIZE:

                        currentDiagonalStr=" ".join(currentDiagonalArray)
                        print(len(currentDiagonalStr),currentDiagonalStr," : ", file1_index , " , ", file2_index)
                        #---------------------------------------------------------
                        padFile1Before=file1_index-PAD_SPACE
                        padFile1After=file1_index+PAD_SPACE

                        padFile2Before=file2_index-PAD_SPACE
                        padFile2After=file2_index+PAD_SPACE

                        quote_file1Array=file1TextArray[padFile1Before:padFile1After]
                        quote_file2Array=file2TextArray[padFile2Before:padFile2After]


                        quote_file1Str=" ".join(quote_file1Array)
                        quote_file2Str=" ".join(quote_file2Array)

                        print("q: ",quote_file1Str)
                        print("b: ",quote_file2Str)
                        print("---------------------------------")
                        #insert into database and commit
                        match_text=currentDiagonalStr
                        length_text=len(currentDiagonalStr)
                        file1_position=file1_index
                        file2_position=file2_index
                        file1_pad=quote_file1Str
                        file2_pad=quote_file2Str
                        addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, currentDiagonalArray)
                        #---------------------------------------

                currentDiagonalArray=[]
                currentDiagonalLen=0
        #-----end add to database------


        if(counter % 100000000 == 0):
                print(total_range/counter)


        #increment downwards
        file1_index+=1
        file2_index+=1

        if file2_index > (maxrow-1) or file1_index > (maxcol-1):
                currentDiagonalArray=[]
                currentDiagonalLen=0
                #print(maxrow, "greater file2_index than maxrow-1", file2_index)
                file1_index=0
                prev_file2_index+=1
                file2_index=prev_file2_index

                if file2_index > (maxrow-1):
                        #print("MAXIMUM below")
                        break

# Close db before too many connections
c.close()


waitForInput = input('--> ')
print(waitForInput)