Sunday, March 18, 2012

Read a Quran Text File, Compare It with the Bible, and Remove Common Words

import re
# Tokenizer shared by every section below: each match is one run of letters,
# so digits, punctuation and whitespace all act as word separators.
wordListPattern = re.compile('[a-z]+', flags=re.IGNORECASE)

#------------------------------------------
# quran
#------------------------------------------
def _readWordSet(path):
    """Return the set of distinct lower-cased words in the text file at *path*.

    A "word" is whatever ``wordListPattern`` matches.  Each line is
    lower-cased before matching so that differently-cased spellings of the
    same word collapse into a single entry.

    The file is opened in text mode with the platform default encoding and
    is always closed, even if reading fails part-way through.  Lines are
    streamed directly into the set, so neither the full list of lines nor
    the full list of word occurrences is ever held in memory.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    words = set()
    with open(path) as textFile:
        for line in textFile:
            words.update(wordListPattern.findall(line.lower()))
    return words


quranWordSet = _readWordSet("C:\\Users\\Omar\\Desktop\\QuranEnglishUS\\superquran.txt")
print('end quran dump')


#------------------------------------------
# bible
#------------------------------------------
bibleWordSet = _readWordSet("C:\\Users\\Omar\\Desktop\\webtxt\\superbible.txt")
print('end bible dump')

#------------------------------------------
# common word list
#------------------------------------------
commonWordSet = _readWordSet("C:\\Users\\Omar\\Desktop\\commonword1000.txt")
print('end common dump')


# Vocabulary that appears in both texts ...
intersectionWordSet = quranWordSet & bibleWordSet
# ... with the everyday common words filtered out ...
differenceWordSet = intersectionWordSet - commonWordSet
# ... arranged alphabetically for output.
sortedWordSet = list(differenceWordSet)
sortedWordSet.sort()

print('---cleaned----')
# One word per line.
for word in sortedWordSet:
    print(word)

No comments:

Post a Comment