# Given an "fna" file, this counts all substrings with length 2 and
# prints them in decreasing order of frequency.
#
# The code below uses only constructs that behave the same under
# Python 2 and Python 3 (no print statement, no dict.has_key).

import sys
import time


def load(file):
    """Load a DNA sequence from the file at path *file*.

    The first line of the file is skipped (the contents begin with the
    second line) and every newline character is removed from the rest.

    Returns the remaining text as a single string; an empty file yields
    an empty string.
    """
    # The context manager closes the handle even if reading fails.
    with open(file, "r") as f:
        f.readline()  # discard the first line
        return f.read().replace("\n", "")


def countsubstrings(str):
    """Count every length-2 substring of *str*, overlaps included.

    Returns a dictionary mapping each substring to its frequency.
    Inputs shorter than two characters give an empty dictionary.
    """
    counts = {}
    # Stopping at len - 1 guarantees every slice has exactly two characters.
    for i in range(len(str) - 1):
        pair = str[i:i + 2]
        counts[pair] = counts.get(pair, 0) + 1
    return counts


def main(argv=None):
    """Print the substring counts for the file named on the command line.

    Substrings are printed in decreasing order of frequency; ties are
    ordered by substring so the output is deterministic.  The elapsed
    wall-clock time in seconds is printed last.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit("usage: %s <fna-file>" % argv[0])

    start = time.time()

    # load() is defined in this module, so it is called directly rather
    # than through a self-import under the module name.
    seq = load(argv[1])
    count = countsubstrings(seq)

    # Dictionaries cannot be sorted in place; sort the (substring, count)
    # pairs instead, highest count first.
    ordered = sorted(count.items(), key=lambda kv: (-kv[1], kv[0]))
    for k, v in ordered:
        print("%s  occurs  %s  times" % (k, v))

    end = time.time()
    print(end - start)  # elapsed time in seconds


if __name__ == '__main__':
    main()