#! /usr/bin/python
#updated 06/29/04
#weili@jimmy.harvard.edu
"""Mask simple repeats in a FASTA file.

Every run matched by one of the patterns in ``repeats`` is overwritten with
'N' characters, one 'N' per matched character, so the sequence length is
preserved.  The patterns cover:

* 'N' / 'n' runs of 3 or more,
* single-base runs of 6 or more (e.g. ``AAAAAA``),
* two-base units repeated 4 or more times (e.g. ``ACACACAC``),

for upper-case and lower-case bases separately.

Usage: python rmRepeats.py infilename outfilename
"""

import re
import string  # not used below any more; kept so the file's import set is unchanged
import sys

# compile the repeat patterns
# Order matters for reproducibility: the N-runs first, then for each base the
# single-base run followed by its four two-base repeats, upper case before
# lower case.
repeats = [re.compile("N{3,}"), re.compile("n{3,}")]
for alphabet in ("ACGT", "acgt"):
    for base1 in alphabet:
        repeats.append(re.compile("%s{6,}" % base1))
        for base2 in alphabet:
            repeats.append(re.compile("(%s%s){4,}" % (base1, base2)))

# compile pattern for the first line of each sequence
seqname = re.compile(">(.+)\n")

# Sink used by matchAndPrint() when it is called without an explicit `out`.
outfile = sys.stdout


def _maskMatch(match):
    """Return a run of 'N' exactly as long as the text matched by `match`."""
    return 'N' * len(match.group(0))


def maskRepeats(seq):
    """Return `seq` with every simple repeat overwritten by 'N' characters.

    Each match is masked with its own length (the replacement is computed per
    match), so the result always has the same length as `seq`.  The patterns
    are applied one after another in the order of ``repeats``; later patterns
    see the output of earlier ones.
    """
    for repeat in repeats:
        seq = repeat.sub(_maskMatch, seq)
    return seq


def matchAndPrint(id, seq, out=None):
    """Mask the repeats in `seq` and write it as one FASTA record.

    id  -- text written after '>' on the header line
    seq -- the sequence, written on a single line after masking
    out -- object with a ``write`` method; when omitted, the module-level
           ``outfile`` is used
    """
    if out is None:
        out = outfile
    out.write(">%s\n%s\n" % (id, maskRepeats(seq)))


def processFasta(infile, out):
    """Copy the records read from `infile` to `out` with repeats masked.

    infile -- iterable of text lines (e.g. an open file)
    out    -- object with a ``write`` method

    A line matching ``seqname`` starts a new record; every other line is
    right-stripped and appended to the current sequence, so each sequence is
    written back on a single line.  Text that precedes the first header is
    discarded when a header follows it; if the input contains no header at
    all, that text is written as one record with an empty id.  Nothing is
    written for input that contains neither a header nor sequence text.
    """
    seqID = ""
    parts = []
    seenHeader = False
    for line in infile:
        header = seqname.match(line)
        if header:
            # got a new sequence: flush the previous record first
            if seenHeader:
                matchAndPrint(seqID, "".join(parts), out)
            seenHeader = True
            seqID = header.group(1)
            parts = []
        else:
            # combine the actual sequence
            parts.append(line.rstrip())
    # print out the last sequence
    sequence = "".join(parts)
    if seenHeader or sequence:
        matchAndPrint(seqID, sequence, out)


def main(argv):
    """Run the filter: ``argv[1]`` is the input path, ``argv[2]`` the output path."""
    if len(argv) < 3:
        sys.exit("Usage: python rmRepeats.py infilename outfilename")
    infilename = argv[1]
    outfilename = argv[2]
    # Single-argument print() gives the same output on Python 2 and Python 3.
    print("In file %s Out file %s" % (infilename, outfilename))
    # The input is opened first, so a missing input file does not create or
    # truncate the output file.
    with open(infilename, 'r') as infile:
        with open(outfilename, 'w') as out:
            processFasta(infile, out)


if __name__ == "__main__":
    main(sys.argv)