# Given an "fna" file, this counts all substrings with length 2 and
# prints them in decreasing order with their frequencies.


def load(file):
    '''
    Load a DNA sequence from the given "fna" file.

    The first line of the file is treated as a header and skipped; the
    remaining lines are joined together with their line endings removed.

    file: path of the file to read.
    Returns the sequence as one string ('' when the file has nothing
    after its first line, including when the file is empty).
    '''
    # 'with' closes the file even if reading fails.
    with open(file, "r") as f:
        alllines = f.readlines()
    # Slicing (instead of 'del alllines[0]') also copes with an empty file.
    # Trailing '\r' is stripped too, so CRLF files do not leak carriage
    # returns into the sequence.
    return ''.join(line.rstrip('\r\n') for line in alllines[1:])


def countsubstrings(str):
    '''
    Count all substrings with length 2 in the given string.

    str: the string to scan.  (The name shadows the builtin; it is kept
         so that existing keyword callers keep working.)
    Returns count = [(n1, str1), (n2, str2), ...] with one entry per
    distinct substring, listed in order of first occurrence.  Overlapping
    occurrences are all counted; a string shorter than 2 gives [].
    '''
    from collections import OrderedDict

    # One hash lookup per position instead of a linear scan of the result
    # list; OrderedDict keeps first-occurrence order on every Python version.
    tally = OrderedDict()
    # Stopping at len - 1 means every slice has exactly two characters.
    for i in range(len(str) - 1):
        substr = str[i:i+2]
        tally[substr] = tally.get(substr, 0) + 1
    return [(n, s) for (s, n) in tally.items()]


def main():
    '''
    Load the sequence from the file named by the first command line
    argument, print every length-2 substring with its frequency in
    decreasing order, then print the elapsed time in seconds.
    '''
    import sys
    import time

    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit("usage: %s FILE" % sys.argv[0])

    start = time.time()  # current time
    seq = load(sys.argv[1])  # load DNA sequence
    count = countsubstrings(seq)

    # Print all substrings in decreasing order with their frequencies.
    # Sorting the (n, s) tuples in reverse puts the highest count first and
    # breaks ties by the substring, in descending order.
    for (n, s) in sorted(count, reverse=True):
        # Single-argument print works on Python 2 and 3; the double spaces
        # reproduce the spacing of the original comma-separated print.
        print("%s  occurs  %d  times" % (s, n))

    end = time.time()  # current time
    print(end - start)  # print required time


if __name__ == '__main__':
    main()