/* #define DEBUG_SPUTNIK 1 */ /* find repeats in fasta format seq file allows for indels, returns score. beta version. caveat emptor. chrisa 29-Jul-94 chris abajian University of Washington Dept. of Molecular Biotechnology FJ-20 Fluke Hall, Mason Road Seattle WA 98195 */ /* Small alterations made to Chris' code to make the program more generally useable Wim Glassee VIB - Department of Molecular Genetics University of Antwerp - CDE Parking P4, Building V, Room 0.07 Universiteitsplein 1 BE-2610 Antwerpen Belgium E-mail: wim.glassee@ua.ac.be Website: http://www.molgen.ua.ac.be 10/12/2002, Wim, Altered the code so that the sequence length is limited by the file size, not MAX_SEQUENCE_LEN 06/01/2003, Wim, Allow for singular repeats 14/03/2005 Wim, don't ignore mononucleotide repeats 21/03/2005 Wim, Don't ignore mono's when looking for stretches! Changed the minimum hit length to 10 */ #include #include #include #include #include #include #include /* 10/12/02, Wim */ #include /* 20/10/03, Wim */ /* trivial defs */ #ifndef True #define True 1 #endif #ifndef False #define False 0 #endif typedef int Boolean; /* size of buffer for reads. */ #define BUF_SIZE 1024*10 /* 10K */ /* max size of description line (begins with ">") */ #define MAX_DESCRIPTION_LEN 1024 /* max sequence length */ #define MAX_SEQUENCE_LEN 1024*5000 /* 5000K */ /* max number of sequence chars dumped to line */ #define MAX_OUT_LINE_CHARS 60 /* for debugging only */ #define MAX_ERRCODES 1024 /* 06/01/03, Wim, convert defines to int */ /* search params and definitions */ // #define MIN_UNIT_LENGTH 2 /* start search with dinucleotide repeats */ int MIN_UNIT_LENGTH = 2; /* will search for di, tri, tetra ... nucleotide repeats up to this value for n */ //#define MAX_UNIT_LENGTH 20 /* up to and including pentanucleotides */ int MAX_UNIT_LENGTH = 20; /* this is the point score for each exact match */ //#define EXACT_MATCH_POINTS 1 int EXACT_MATCH_POINTS = 1; /* this is the point score for a mismatch, insertion or deletion */ //#define ERROR_MATCH_POINTS -6 int ERROR_MATCH_POINTS = -6; /* this is the minimum score required to be considered a match */ //#define MATCH_MIN_SCORE 8 int MATCH_MIN_SCORE = 8; /* 14/03/05 Wim, lower match min score to 6 */ // int MATCH_MIN_SCORE = 6; /* this is the low score at which we stop trying */ //#define MATCH_FAIL_SCORE -1 int MATCH_FAIL_SCORE = -1; /* this is the max recursion depth we try to recover errors */ //#define MAX_RECURSION 5 int MAX_RECURSION = 5; /* 06/01/03, Wim, removed array size */ /* char *repeatName[MAX_UNIT_LENGTH+1] = */ char *repeatName[] = { "***ERROR***", /* bad programmer! no latte! */ "single nucleotide", "dinucleotide", "trinucleotide", "tetranucleotide", "pentanucleotide", "6", "7", "8", "9", "10", "11", "12", }; char readBuf[BUF_SIZE]; Boolean endOfFile; int curBufLen; int curBufPos; int fd; Boolean havePutBack; char putBack; /* struct for indiv sequence in a file */ typedef struct ss { char descStr[MAX_DESCRIPTION_LEN]; /* 10/12/2002, Wim */ /* char seqStr[MAX_SEQUENCE_LEN];*/ char * seqStr; unsigned int seqLen; } SeqStruct, *SeqStructPtr; /* * this structure describes the current state of a comparison. * it gets passed down to recursive calls of the find repeat * call so it can know when to bail out of an unsuccessful * search, or return the size/state of a successful hit, etc. */ typedef struct ms { int curPos; /* putative pattern starts here */ int testPos; /* start testing here */ int testLen; /* di, tri, tetra, etc. */ int testCtr; /* # chars in testLen already tested. mod counter */ int curScore; /* current score */ int missense; /* keep track of ins, del, err */ int insertions; int deletions; int depth; /* how deep is recursion for this match */ char errCodes[MAX_ERRCODES]; } MatchStruct, *MatchStructPtr; /* a utility macro to copy one testStruct to another */ #define copyMSPtr(dest,source) memcpy((char *)dest,(char *)source,sizeof(MatchStruct)) /* a utility macro to increment the modular testCtr */ #define bumpTestCtr(msp) (msp)->testCtr++; if ((msp)->testCtr==(msp)->testLen) (msp)->testCtr=0; /* ************************************************************ * these routines are used to read and parse the fasta format * sequence file ************************************************************ */ void fillBuf() { size_t result; result = read(fd, (void *)readBuf, BUF_SIZE); if (result == -1) { fprintf(stderr,"error reading file! errno = %d\n",errno); exit(1); } else if (result == 0) endOfFile = True; else { curBufLen = result; curBufPos = 0; } } /* readBuf */ /* returns True on success */ Boolean getChar(char *achar) { if (havePutBack) { *achar = putBack; havePutBack = False; return(True); } if (curBufPos == curBufLen) fillBuf(); if (endOfFile) return (False); *achar = readBuf[curBufPos++]; return (True); } void putCharBack(char c) { havePutBack = True; putBack = c; } void openFile(char *fn) { /* open the specified file */ fd = open(fn, O_RDONLY); if (fd == -1) { fprintf(stderr,"unable to open file %s\n", fn); exit(1); } } /* should call this once for each file read */ void initBuffer() { /* initialize length and pointer */ curBufPos = 0; curBufLen = 0; havePutBack = False; endOfFile = False; } void addCharToLine(char c, char *line, int *lineLen) { if (*lineLen < MAX_DESCRIPTION_LEN) line[(*lineLen)++] = c; else fprintf(stderr,"warning: description line truncated\n"); } /* ********************************************************************* * these routines are (more) specific to reading the fasta file format ********************************************************************* */ /* * pick up a non-blank line from the file, presumably description. * truncates all leading blanks and/or blank lines */ Boolean getNonBlankLine(char *line) { Boolean stop, nonBlank; char c; int lineLen; lineLen = 0; stop = False; nonBlank = False; /* will be set by any non whitespace char */ while ((! endOfFile) && (! stop)) if (getChar(&c)) if (c == '\n') stop = nonBlank; /* stop if have anything. don't save eol char. */ else if (nonBlank) /* add it to line no matter what */ addCharToLine(c,line,&lineLen); else if ((c != ' ') && (c != '\t')) { /* only non whitespace will start the line */ nonBlank = True; addCharToLine(c,line,&lineLen); } } /* load the sequence struct with comment line and bases */ SeqStructPtr getSeq(char *fname) { SeqStructPtr newSeqP; Boolean endOfSeq; char c; /* 10/12/2002, Wim */ struct stat mystat; if (endOfFile) return ((SeqStructPtr )0); /* bombproofing */ /* malloc a new seq */ if (! (newSeqP = (SeqStructPtr )malloc(sizeof(SeqStruct)) ) ) { fprintf(stderr,"unable to malloc() memory for sequence.\n"); exit(1); } /* clear mem */ memset( (void *)newSeqP, '\0', sizeof(SeqStruct)); /* 10/12/2002, Wim */ /* get the file size */ if( stat(fname, &mystat) == -1 ) { fprintf(stderr, "error occured attempting to get file size\n"); exit(0); } /* 10/12/2002, Wim */ /* Allocate memory to hold the sequence */ if( ! (newSeqP->seqStr = (char *) malloc(sizeof(char *) * mystat.st_size)) ) { fprintf(stderr,"unable to malloc() memory for sequence.\n"); exit(1); } /* pick up description line */ if (! getNonBlankLine(newSeqP->descStr) ) { free(newSeqP); return ((SeqStructPtr )0); } /* did it start correctly ? */ if (newSeqP->descStr[0] != '>') { fprintf(stderr,"format error in input file: missing '>'\n"); exit(1); } endOfSeq = False; while ((!endOfFile) && (!endOfSeq)) { if (getChar(&c)) { if (c == '>') { /* hit new sequence */ endOfSeq = True; putCharBack(c); } else if (((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')))/* bogus test, chris */ /* have nucleotide */ newSeqP->seqStr[newSeqP->seqLen++] = toupper(c); else if ((c != '\n') && (c != ' ') && (c != '\t')) { /* wierd shit in file. bail. */ fprintf(stderr,"bad char in sequence, file %s: %c\n",fname,c); exit(1); } } } if (! newSeqP->seqLen) { fprintf(stderr,"? Null sequence encountered in file %s (ignored)\n",fname); fprintf(stderr," %s\n", newSeqP->descStr); free(newSeqP); return ((SeqStructPtr )0); } return(newSeqP); } /* getSeq */ /* for debugging. dump entire seq to stdout. */ #ifdef DEBUG_SPUTNIK void dumpSeq(SeqStructPtr seqP) { int i, charsOnLine; fprintf(stdout,"%s\n", seqP->descStr); fprintf(stdout,"Sequence (length = %d):\n", seqP->seqLen); i = 0; charsOnLine = 0; while (i < seqP->seqLen) { if (charsOnLine == MAX_OUT_LINE_CHARS) { fprintf(stdout,"\n"); charsOnLine = 1; } else charsOnLine++; fprintf(stdout,"%c", seqP->seqStr[i++]); } fprintf(stdout,"\n"); } /* dumpSeq */ #endif /* DEBUG_SPUTNIK */ /* dump the matched seq & stats to stdout */ void dumpMatch(SeqStructPtr seqP, MatchStructPtr matchP, Boolean anyMatchThisSeq) { int i, charsOnLine; if (! anyMatchThisSeq) fprintf(stdout,"%s\n", seqP->descStr); /* fprintf(stdout,"%s %d : %d -- length %d score %d | unit length: %d number: %-4.1f\n", repeatName[matchP->testLen], matchP->curPos+1, matchP->testPos, matchP->testPos - matchP->curPos, matchP->curScore, matchP->testLen, (matchP->testPos - matchP->curPos)/(double)(matchP->testLen)); */ i = matchP->curPos; charsOnLine = 0; while (i < matchP->curPos+matchP->testLen) { fprintf(stdout,"%c", seqP->seqStr[i++]); } fprintf(stdout,"\t%d\t%-4.1f\t%d\t%d\t%d\t%d\t", matchP->testLen, (matchP->testPos - matchP->curPos)/(double)(matchP->testLen), matchP->curPos+1, matchP->testPos, matchP->testPos - matchP->curPos, matchP->curScore); #ifdef DEBUG_SPUTNIK fprintf(stdout,"mis = %d, del = %d, ins = %d\n", matchP->missense, matchP->deletions, matchP->insertions); #endif i = matchP->curPos; while (i < matchP->testPos) { fprintf(stdout,"%c", seqP->seqStr[i++]); } fprintf(stdout,"\n"); #ifdef DEBUG_SPUTNIK i = 0; charsOnLine = 0; while (i < (matchP->testPos - matchP->curPos)) { if (charsOnLine == MAX_OUT_LINE_CHARS) { fprintf(stdout,"\n"); charsOnLine = 1; } else charsOnLine++; if (matchP->errCodes[i] == '\0') fprintf(stdout," "); else fprintf(stdout,"%c", matchP->errCodes[i]); i++; } fprintf(stdout,"\n"); #endif } /* dumpMatch */ Boolean testForNRepeat(SeqStructPtr seqP, MatchStructPtr matchP) { MatchStruct curMatch, recover, bestSoFar, bestOfABadLot; /* save matchP in case we fail altogether. */ copyMSPtr(&curMatch, matchP); /* keep track of the best score and return that if over thresh. */ copyMSPtr(&bestSoFar, matchP); while ( (curMatch.testPos < seqP->seqLen) /* anything to test */ && (curMatch.curScore > MATCH_FAIL_SCORE) ) /* above fail threshold */ { /* test a base */ if (seqP->seqStr[curMatch.curPos+curMatch.testCtr] == seqP->seqStr[curMatch.testPos]) { /* we matched. this is easy. */ curMatch.curScore += EXACT_MATCH_POINTS; /* score your points */ curMatch.testPos++; /* advance the downstream test position */ bumpTestCtr(&curMatch); /* advance pos in the (presumed) repeating seq */ } else if (seqP->seqStr[curMatch.testPos] == 'N') { /* don't call it wrong, but no credit either */ curMatch.testPos++; /* advance the downstream test position */ bumpTestCtr(&curMatch); /* advance pos in the (presumed) repeating seq */ } else { /* no match. take the score penalty, but keep going (maybe). */ curMatch.curScore += ERROR_MATCH_POINTS; curMatch.testPos++; /* advance the downstream test position */ bumpTestCtr(&curMatch); /* advance pos in seq */ /* is the score too bad to continue, or are we already too deep? */ if ( (curMatch.curScore > MATCH_FAIL_SCORE) && (curMatch.depth < MAX_RECURSION) ) { /* try simple missense */ copyMSPtr(&recover,&curMatch); if ((recover.testPos - recover.curPos) < MAX_ERRCODES) recover.errCodes[recover.testPos - recover.curPos -1] = 'M'; recover.missense++; recover.depth++; (void )testForNRepeat(seqP,&recover); copyMSPtr(&bestOfABadLot,&recover); /* try deletion */ copyMSPtr(&recover,&curMatch); if ((recover.testPos - recover.curPos) < MAX_ERRCODES) recover.errCodes[recover.testPos - recover.curPos -1] = 'D'; recover.testPos--; /* DON'T advance downstream */ recover.deletions++; recover.depth++; (void )testForNRepeat(seqP,&recover); if (recover.curScore > bestOfABadLot.curScore) copyMSPtr(&bestOfABadLot,&recover); /* try insertion */ copyMSPtr(&recover,&curMatch); if ((recover.testPos - recover.curPos) < MAX_ERRCODES) recover.errCodes[recover.testPos - recover.curPos -1] = 'I'; /* RETEST for this base in the repeating seq */ if (recover.testCtr == 0) recover.testCtr = recover.testLen - 1; else recover.testCtr--; recover.insertions++; recover.depth++; (void )testForNRepeat(seqP,&recover); if (recover.curScore > bestOfABadLot.curScore) copyMSPtr(&bestOfABadLot,&recover); /* take the best of a bad lot */ bestOfABadLot.depth--; /* dec recursion counter */ copyMSPtr(&curMatch, &bestOfABadLot); } /* it was worth carrying on */ } /* no match, found best of bad lot */ /* whatever happened, the best we could do is now in matchP */ if (curMatch.curScore > bestSoFar.curScore) copyMSPtr(&bestSoFar, &curMatch); } /* while loop to test a single base */ /* for whatever reason, we've stopped searching for more of this putative repeat. if there were any matches that passed the global threshold, return the best of them. note that this has the effect of NOT advancing the pointer(s) if nothing rang the bell. remember that we will test the same position for ntide repeats of several different lengths. */ if (bestSoFar.curScore > MATCH_MIN_SCORE) { copyMSPtr(matchP, &bestSoFar); return(True); } return(False); /* the whole thing was a waste of time */ } /* testForNRepeat */ /* * returns True if the sequence we want to look for repeats of is * * a) all the same base (i.e. 'AAA' or 'GG'). This filters out * single nucleotide repeats * * b) conains 'N'. we search against these, but don't use them * as wildcards. */ // Add allowMono, to allow Mononucleotide repeats Boolean ignoreSeq(SeqStructPtr seqP, MatchStructPtr matchP, Boolean allowMono) { int i; /* firstly, never search for any pattern that contains N */ for (i = 0; i < matchP->testLen; i++) if (seqP->seqStr[matchP->curPos+i] == 'N') return(True); /* now test for mononucleotide repeat. other tests may get added, in which case this one will beed to be changed. */ /* 06/01/03, Wim, so as not to ignore single repeats */ if (matchP->testLen == 1) return(False); /* 14/03/05, Wim, Do not filter out mononucleotide repeats */ if( allowMono == True ) return (False); else { for (i = 1; i < matchP->testLen; i++) if (seqP->seqStr[matchP->curPos] != seqP->seqStr[matchP->curPos+i]) return(False); /* they're not all the same */ return (True); /* they ARE all same */ } } /* 06/01/03, Wim, add an argument to decide on the printing of a defline */ void findRepeats(SeqStructPtr seqP, Boolean anyMatchThisSeq, Boolean allowMono ) { int curPos; Boolean /*anyMatchThisSeq,*/ matchAtThisPos; MatchStruct match; memset( (char *)&match, 0, sizeof(MatchStruct) ); /* clear match struct */ // anyMatchThisSeq = False; /* avoid dumping description more than once. */ /* loop on all positions in the sequence. note that a match will advance curPos past all matching chars to the first unmatched char. */ while ( match.curPos <= seqP->seqLen) { /* now loop on all the different lengths of repeats we're looking for (i.e. di, tri, tetra nucleotides. if we find a match at a shorter repeat length, forego testing for longer lengths. */ match.testLen = MIN_UNIT_LENGTH; matchAtThisPos = False; while ((match.testLen <= MAX_UNIT_LENGTH) && (!matchAtThisPos)) { /* initialize the state of the match */ match.curScore = 0; /* no points yet */ match.testCtr = 0; /* no chars tested yet */ match.testPos = match.curPos + match.testLen; match.insertions = 0; match.deletions = 0; match.missense = 0; /* there are some things we don't want to test for */ if (! ignoreSeq(seqP,&match, allowMono)) matchAtThisPos = testForNRepeat(seqP, &match); else matchAtThisPos = False; if (! matchAtThisPos) match.testLen++; } if (matchAtThisPos) { dumpMatch(seqP,&match,anyMatchThisSeq); anyMatchThisSeq |= matchAtThisPos; match.curPos = match.testPos; } else match.curPos++; /* no, so advance to next base. */ /* if (!(match.curPos % 1000)) { fprintf(stderr,".");fflush(stderr); } */ } } main(int argc, char* argv[]) { SeqStructPtr seqP; int count; char * filename; int stretches = 0; int simple = 1; /* 06/01/03, Wim, added -s and -N parameter, pretty awkward parameter parsing btw */ if (argc < 2) { /* fprintf(stderr,"Usage: %s \n", argv[0]); */ fprintf(stderr,"Usage: %s [-s] \n", argv[0]); fprintf(stderr,"\t-s\t show stretches\n"); fprintf(stderr,"\t-N\t do NOT show simple repeats\n"); exit(1); } if( argc >= 3 ) { if ( strcmp(argv[1], "-s") == 0 ) { stretches = 1; } else if ( strcmp(argv[1], "-N" ) == 0 ) { simple = 0; } } if( argc >= 4 ) { if ( strcmp(argv[2], "-s") == 0 ) { stretches = 1; } else if ( strcmp(argv[2], "-N" ) == 0 ) { simple = 0; } } filename = argv[argc-1]; openFile(filename); initBuffer(); fprintf(stdout,"Unit\tUnit size\tUnit number\tBegin position\tEnd position\tLength\tscore\tSequence\n"); count = 0; while (! endOfFile) if (seqP = getSeq(filename)) { #ifdef DEBUG_SPUTNIK fprintf(stdout,"processing sequence %d\n", count++); #endif /* dumpSeq(seqP); */ if( simple ) findRepeats(seqP, False, False); if( stretches ) { MIN_UNIT_LENGTH = 1; MAX_UNIT_LENGTH = 1; EXACT_MATCH_POINTS = 1; MATCH_MIN_SCORE = 9; MAX_RECURSION = 2; ERROR_MATCH_POINTS = -1000; /* Only show defline when simple has not been run. This can still cause a problem when simple doesn't return any repeats */ if( simple ) findRepeats(seqP, True, True); else findRepeats(seqP, False, True); } free((void *)seqP); } /* Wim 20/10/03 */ return(EXIT_SUCCESS); }