Friday, May 18, 2012

Compare Words in Two Files All Matches and Store in an Sqlite Database using Python

import re
import sqlite3


COMMONWORDS=['the',
'be',
'to',
'of',
'and',
'a',
'in',
'that',
'have',
'I',
'it',
'for',
'not',
'on',
'with',
'he',
'as',
'you',
'do',
'at',
'this',
'but',
'his',
'by',
'from',
'they',
'we',
'say',
'her',
'she',
'or',
'an',
'will',
'my',
'one',
'all',
'would',
'there',
'their',
'what',
'so',
'up',
'out',
'if',
'about',
'who',
'get',
'which',
'go',
'me',
'when',
'make',
'can',
'like',
'time',
'no',
'just',
'him',
'know',
'take',
'person',
'into',
'year',
'your',
'good',
'some',
'could',
'them',
'see',
'other',
'than',
'then',
'now',
'look',
'only',
'come',
'its',
'over',
'think',
'also',
'back',
'after',
'use',
'two',
'how',
'our',
'work',
'first',
'well',
'way',
'even',
'new',
'want',
'because',
'any',
'these',
'give',
'day',
'most',
'us']

#----------------setup database
conn = sqlite3.connect('h:\\Python\\ReadWord\\readword.sqlite')
#for pythonanywhere
#conn = sqlite3.connect('/home/torontoomar/readword3.sqlite')
c = conn.cursor()
# Create table matchtext
c.execute('''DROP TABLE IF EXISTS matchtext''')
c.execute('''create table matchtext(ID INTEGER PRIMARY KEY, match_text text, length_text real,
file1_position real, file2_position real,
file1_pad text, file2_pad text, match_text_sum text, length_sum_text real, uniquephraseID INTEGER)''')
# Create table unique_phrase
c.execute('''DROP TABLE IF EXISTS uniquephrase''')
c.execute('''create table uniquephrase(ID INTEGER PRIMARY KEY, match_text_sum text, length_sum_text real, numtimes real)''')


# Save (commit) the changes
conn.commit()
#----------------end setup database

def checkIfInDatabase(match_text_sum,length_sum_text ):
        tuple_to_select = (match_text_sum,)
        c.execute('''select * from uniquephrase where match_text_sum=?''', tuple_to_select )
        #need to fetchone since can't rely on c.rowcount
        cursorRow = c.fetchone()
        lastrowid=1
        if cursorRow is not None: 
                cursorRowId=cursorRow[0]
                lastrowid=cursorRowId
                print('id:',cursorRowId)
                cursorNumTimes = cursorRow[3]
                print('numTimes:',cursorNumTimes)
                cursorNumTimes +=1
                tuple_to_update = (cursorNumTimes, cursorRowId)
                print(cursorRowId)
                c.execute('''update uniquephrase set numtimes=? where ID=?''', tuple_to_update )
                conn.commit()
        else:
                tuple_to_insert =(match_text_sum,length_sum_text,1)
                c.execute('''insert into uniquephrase(match_text_sum, length_sum_text, numtimes) values (?,?,?)''', tuple_to_insert)        
                conn.commit()
                #only works for insert statement lastrowid
                lastrowid=c.lastrowid

        return lastrowid

def addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, diagonalArray):

        # go through common words an only pick ones that don't exist
        # if all removed, then special case?
        newDiagonalArray=[]
        for matchWord in diagonalArray:
                uniqueWord=True
                for commonWord in COMMONWORDS:
                        if matchWord == commonWord:
                                uniqueWord=False
                if uniqueWord==True:
                        newDiagonalArray.append(matchWord)

        length_sum_text=len(newDiagonalArray)
        match_text_sum =" ".join(newDiagonalArray)
        #check if in database, if not add as a unique item
        lastrowid=checkIfInDatabase(match_text_sum, length_sum_text)
        
        # id is auto increment if omit, but must name columns
        tuple_to_insert = (match_text, length_text, file1_position,file2_position, file1_pad, file2_pad,match_text_sum, length_sum_text,lastrowid)
        c.execute('''insert into matchtext(match_text, length_text,
                        file1_position, file2_position,file1_pad,
                        file2_pad, match_text_sum, length_sum_text, uniquephraseID) values (?,?,?,?,?,?,?,?,?)''', tuple_to_insert)
        conn.commit()

#for pythonanywhere
#file1 = open('/home/torontoomar/file1.txt', 'r')
file1 = open('h:\\Python\\ReadFile\\file1.txt', 'r')
file1Text = file1.read()
file1.close()

#for pythonanywhere
#file2 = open('/home/torontoomar/file2.txt', 'r')
file2 = open('h:\\Python\\ReadFile\\file2.txt', 'r')
file2Text = file2.read()
file2.close()

file1Text=file1Text.lower()
file2Text=file2Text.lower()

file1List=file1Text.splitlines()
file2List=file2Text.splitlines()
file1Text=" ".join(file1List)
file2Text=" ".join(file2List)


#clean up file1Text and file2Text , and .
file1TextArray = re.findall(r"\w+", file1Text)
file2TextArray = re.findall(r"\w+",file2Text)


len_file1=len(file1TextArray)
len_file2=len(file2TextArray)
diagonal_size=(len_file1+len_file2)-1

currentDiagonalArray=[]
MAX_DIAGONAL_SIZE=3
PAD_SPACE=30

currentDiagonalLen=0

maxrow=len_file2
maxcol=len_file1

print("1,1----------------------------")
#----------------------------------
#do case 1,1
if maxrow==maxcol:
        total_range=maxrow
elif maxrow > maxcol:
        total_range=maxcol
elif maxrow < maxcol:
        total_range=maxrow

file1_index=0
file2_index=0
for i in range(total_range):
#while (1):
        file1Word = file1TextArray[file1_index]
        file2Word = file2TextArray[file2_index]

        #print(file1_index,":",file2_index)
        #print(file1Word," ",file2Word)

        #-----add to database-------
        if file1Word==file2Word:
                #http://wiki.python.org/moin/PythonSpeed/PerformanceTips
                currentDiagonalArray.append(file1Word)


        else:
                if len(currentDiagonalArray) > MAX_DIAGONAL_SIZE:

                        currentDiagonalStr=" ".join(currentDiagonalArray)


                        print(len(currentDiagonalStr),currentDiagonalStr," : ", file1_index , " , ", file2_index)

                        #---------------------------------------------------------
                        padFile1Before=file1_index-PAD_SPACE
                        padFile1After=file1_index+PAD_SPACE

                        padFile2Before=file2_index-PAD_SPACE
                        padFile2After=file2_index+PAD_SPACE

                        quote_file1Array=file1TextArray[padFile1Before:padFile1After]
                        quote_file2Array=file2TextArray[padFile2Before:padFile2After]


                        quote_file1Str=" ".join(quote_file1Array)
                        quote_file2Str=" ".join(quote_file2Array)

                        print("q: ",quote_file1Str)
                        print("b: ",quote_file2Str)
                        print("---------------------------------")
                        #insert into database and commit
                        match_text=currentDiagonalStr
                        length_text=len(currentDiagonalStr)
                        file1_position=file1_index
                        file2_position=file2_index
                        file1_pad=quote_file1Str
                        file2_pad=quote_file2Str
                        addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, currentDiagonalArray)
                        #---------------------------------------

                currentDiagonalArray=[]
                currentDiagonalLen=0
        #-----end add to database------

        #increment downwards
        file1_index+=1
        file2_index+=1

# C---------------------------
##need to clean up
## DUMP


print("above----------------------------")
#----------------------------------

# do case minrow,column from 2 to maxrow
# startend = (maxcol-varcol) + 1, maxcolumn
# above the line
currentDiagonalArray=[]
currentDiagonalLen=0

file1_index=1
file2_index=0


prev_file1_index=1
counter=0

#for i in range(total_range):
while (1):
        counter+=1

        file1Word = file1TextArray[file1_index]
        file2Word = file2TextArray[file2_index]

        #print(file1_index,":",file2_index)
        #print(file1Word," ",file2Word)

        #-----add to database-------
        if file1Word==file2Word:
                currentDiagonalArray.append(file1Word)


        else:
                if len(currentDiagonalArray) > MAX_DIAGONAL_SIZE:

                        currentDiagonalStr=" ".join(currentDiagonalArray)
                        print(len(currentDiagonalStr),currentDiagonalStr," : ", file1_index , " , ", file2_index)
                        #---------------------------------------------------------
                        padFile1Before=file1_index-PAD_SPACE
                        padFile1After=file1_index+PAD_SPACE

                        padFile2Before=file2_index-PAD_SPACE
                        padFile2After=file2_index+PAD_SPACE

                        quote_file1Array=file1TextArray[padFile1Before:padFile1After]
                        quote_file2Array=file2TextArray[padFile2Before:padFile2After]


                        quote_file1Str=" ".join(quote_file1Array)
                        quote_file2Str=" ".join(quote_file2Array)

                        print("q: ",quote_file1Str)
                        print("b: ",quote_file2Str)
                        print("---------------------------------")
                        #insert into database and commit
                        match_text=currentDiagonalStr
                        length_text=len(currentDiagonalStr)
                        file1_position=file1_index
                        file2_position=file2_index
                        file1_pad=quote_file1Str
                        file2_pad=quote_file2Str
                        addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, currentDiagonalArray)
                        #---------------------------------------

                currentDiagonalArray=[]
                currentDiagonalLen=0
        #-----end add to database------


        #increment downwards
        file1_index+=1
        file2_index+=1

        if file2_index > (maxrow-1) or file1_index > (maxcol-1):
                currentDiagonalArray=[]
                #print(maxrow, "greater file2_index than maxrow-1", file2_index)
                file2_index=0
                prev_file1_index+=1
                file1_index=prev_file1_index

                if file1_index > (maxcol-1):
                        #print("MAXIMUM above")
                        break

print("counter",counter,"range",total_range)
#print("below----------------------------")
#----------------------------------
# below the line

currentDiagonalArray=[]
currentDiagonalLen=0

file1_index=0
file2_index=1
total_range = ((maxrow-1) * maxcol) - 1
prev_file2_index=1
counter=0
#for i in range(total_range):
while(1):
        counter+=1
        file1Word = file1TextArray[file1_index]
        file2Word = file2TextArray[file2_index]

        #print(file1_index,":",file2_index)
        #print(file1Word," ",file2Word)

        #-----add to database-------
        if file1Word==file2Word:
                currentDiagonalArray.append(file1Word)
                currentDiagonalLen+=1

        else:
                if len(currentDiagonalArray) > MAX_DIAGONAL_SIZE:

                        currentDiagonalStr=" ".join(currentDiagonalArray)
                        print(len(currentDiagonalStr),currentDiagonalStr," : ", file1_index , " , ", file2_index)
                        #---------------------------------------------------------
                        padFile1Before=file1_index-PAD_SPACE
                        padFile1After=file1_index+PAD_SPACE

                        padFile2Before=file2_index-PAD_SPACE
                        padFile2After=file2_index+PAD_SPACE

                        quote_file1Array=file1TextArray[padFile1Before:padFile1After]
                        quote_file2Array=file2TextArray[padFile2Before:padFile2After]


                        quote_file1Str=" ".join(quote_file1Array)
                        quote_file2Str=" ".join(quote_file2Array)

                        print("q: ",quote_file1Str)
                        print("b: ",quote_file2Str)
                        print("---------------------------------")
                        #insert into database and commit
                        match_text=currentDiagonalStr
                        length_text=len(currentDiagonalStr)
                        file1_position=file1_index
                        file2_position=file2_index
                        file1_pad=quote_file1Str
                        file2_pad=quote_file2Str
                        addToDatabase(match_text, length_text, file1_position,file2_position, file1_pad, file2_pad, currentDiagonalArray)
                        #---------------------------------------

                currentDiagonalArray=[]
                currentDiagonalLen=0
        #-----end add to database------


        if(counter % 100000000 == 0):
                print(total_range/counter)


        #increment downwards
        file1_index+=1
        file2_index+=1

        if file2_index > (maxrow-1) or file1_index > (maxcol-1):
                currentDiagonalArray=[]
                currentDiagonalLen=0
                #print(maxrow, "greater file2_index than maxrow-1", file2_index)
                file1_index=0
                prev_file2_index+=1
                file2_index=prev_file2_index

                if file2_index > (maxrow-1):
                        #print("MAXIMUM below")
                        break

# Close db before too many connections
c.close()


waitForInput = input('--> ')
print(waitForInput)

No comments:

Post a Comment