生物信息学算法实现

1. Needleman-Wunsch 全局序列比对

function needlemanWunsch(seq1: string, seq2: string, match: number = 1, mismatch: number = -1, gap: number = -2): { score: number; alignment: [string, string] } {
  const m = seq1.length;
  const n = seq2.length;

  // 初始化得分矩阵
  const score = Array.from({ length: m + 1 }, () => Array(n + 1).fill(0));

  // 初始化第一行和第一列
  for (let i = 1; i <= m; i += 1) {
    score[i][0] = score[i - 1][0] + gap;
  }
  for (let j = 1; j <= n; j += 1) {
    score[0][j] = score[0][j - 1] + gap;
  }

  // 填充得分矩阵
  for (let i = 1; i <= m; i += 1) {
    for (let j = 1; j <= n; j += 1) {
      const matchScore = seq1[i - 1] === seq2[j - 1] ? match : mismatch;
      const diagonal = score[i - 1][j - 1] + matchScore;
      const up = score[i - 1][j] + gap;
      const left = score[i][j - 1] + gap;

      score[i][j] = Math.max(diagonal, up, left);
    }
  }

  // 回溯构造比对
  const alignment1: string[] = [];
  const alignment2: string[] = [];
  let i = m;
  let j = n;

  while (i > 0 || j > 0) {
    if (i > 0 && j > 0 && score[i][j] === score[i - 1][j - 1] + (seq1[i - 1] === seq2[j - 1] ? match : mismatch)) {
      alignment1.unshift(seq1[i - 1]);
      alignment2.unshift(seq2[j - 1]);
      i -= 1;
      j -= 1;
    } else if (i > 0 && score[i][j] === score[i - 1][j] + gap) {
      alignment1.unshift(seq1[i - 1]);
      alignment2.unshift('-');
      i -= 1;
    } else {
      alignment1.unshift('-');
      alignment2.unshift(seq2[j - 1]);
      j -= 1;
    }
  }

  return {
    score: score[m][n],
    alignment: [alignment1.join(''), alignment2.join('')],
  };
}

2. Smith-Waterman 局部序列比对

function smithWaterman(seq1: string, seq2: string, match: number = 1, mismatch: number = -1, gap: number = -2): { score: number; alignment: [string, string]; startPositions: [number, number] } {
  const m = seq1.length;
  const n = seq2.length;

  const score = Array.from({ length: m + 1 }, () => Array(n + 1).fill(0));
  let maxScore = 0;
  let maxI = 0;
  let maxJ = 0;

  // 填充得分矩阵
  for (let i = 1; i <= m; i += 1) {
    for (let j = 1; j <= n; j += 1) {
      const matchScore = seq1[i - 1] === seq2[j - 1] ? match : mismatch;
      const diagonal = score[i - 1][j - 1] + matchScore;
      const up = score[i - 1][j] + gap;
      const left = score[i][j - 1] + gap;

      score[i][j] = Math.max(0, diagonal, up, left);

      if (score[i][j] > maxScore) {
        maxScore = score[i][j];
        maxI = i;
        maxJ = j;
      }
    }
  }

  // 从最大得分位置回溯
  const alignment1: string[] = [];
  const alignment2: string[] = [];
  let i = maxI;
  let j = maxJ;

  while (i > 0 && j > 0 && score[i][j] > 0) {
    if (score[i][j] === score[i - 1][j - 1] + (seq1[i - 1] === seq2[j - 1] ? match : mismatch)) {
      alignment1.unshift(seq1[i - 1]);
      alignment2.unshift(seq2[j - 1]);
      i -= 1;
      j -= 1;
    } else if (score[i][j] === score[i - 1][j] + gap) {
      alignment1.unshift(seq1[i - 1]);
      alignment2.unshift('-');
      i -= 1;
    } else if (score[i][j] === score[i][j - 1] + gap) {
      alignment1.unshift('-');
      alignment2.unshift(seq2[j - 1]);
      j -= 1;
    } else {
      break;
    }
  }

  return {
    score: maxScore,
    alignment: [alignment1.join(''), alignment2.join('')],
    startPositions: [i, j],
  };
}

3. KMP 字符串匹配 (用于序列搜索)

function computePrefixFunction(pattern: string): number[] {
  const m = pattern.length;
  const pi = new Array(m).fill(0);
  let k = 0;

  for (let i = 1; i < m; i += 1) {
    while (k > 0 && pattern[k] !== pattern[i]) {
      k = pi[k - 1];
    }
    if (pattern[k] === pattern[i]) {
      k += 1;
    }
    pi[i] = k;
  }

  return pi;
}

function kmpSearch(text: string, pattern: string): number[] {
  const n = text.length;
  const m = pattern.length;
  const pi = computePrefixFunction(pattern);
  const matches: number[] = [];

  let q = 0; // 匹配的字符数
  for (let i = 0; i < n; i += 1) {
    while (q > 0 && pattern[q] !== text[i]) {
      q = pi[q - 1];
    }
    if (pattern[q] === text[i]) {
      q += 1;
    }
    if (q === m) {
      matches.push(i - m + 1);
      q = pi[q - 1];
    }
  }

  return matches;
}

4. 动态规划序列比对 (简化版)

function sequenceAlignment(seq1: string, seq2: string): { score: number; alignedSeq1: string; alignedSeq2: string } {
  const m = seq1.length;
  const n = seq2.length;

  // 得分矩阵
  const dp = Array.from({ length: m + 1 }, () => Array(n + 1).fill(0));

  // 初始化
  for (let i = 1; i <= m; i += 1) {
    dp[i][0] = dp[i - 1][0] - 1; // 缺口惩罚
  }
  for (let j = 1; j <= n; j += 1) {
    dp[0][j] = dp[0][j - 1] - 1;
  }

  // 填充 DP 表
  for (let i = 1; i <= m; i += 1) {
    for (let j = 1; j <= n; j += 1) {
      const match = seq1[i - 1] === seq2[j - 1] ? 1 : -1;
      const diagonal = dp[i - 1][j - 1] + match;
      const up = dp[i - 1][j] - 1;
      const left = dp[i][j - 1] - 1;

      dp[i][j] = Math.max(diagonal, up, left);
    }
  }

  // 回溯构造比对
  let alignedSeq1 = '';
  let alignedSeq2 = '';
  let i = m;
  let j = n;

  while (i > 0 || j > 0) {
    if (i > 0 && j > 0 && dp[i][j] === dp[i - 1][j - 1] + (seq1[i - 1] === seq2[j - 1] ? 1 : -1)) {
      alignedSeq1 = seq1[i - 1] + alignedSeq1;
      alignedSeq2 = seq2[j - 1] + alignedSeq2;
      i -= 1;
      j -= 1;
    } else if (i > 0 && dp[i][j] === dp[i - 1][j] - 1) {
      alignedSeq1 = seq1[i - 1] + alignedSeq1;
      alignedSeq2 = '-' + alignedSeq2;
      i -= 1;
    } else {
      alignedSeq1 = '-' + alignedSeq1;
      alignedSeq2 = seq2[j - 1] + alignedSeq2;
      j -= 1;
    }
  }

  return {
    score: dp[m][n],
    alignedSeq1,
    alignedSeq2,
  };
}

5. 基因组组装 (重叠-布局-共识简化版)

class GenomeAssembler {
  private reads: string[];

  constructor(reads: string[]) {
    this.reads = reads;
  }

  assemble(): string {
    if (this.reads.length === 0) return '';

    let currentContig = this.reads[0];
    const remainingReads = [...this.reads.slice(1)];

    while (remainingReads.length > 0) {
      let bestOverlap = -1;
      let bestReadIndex = -1;
      let bestPosition = -1; // 0: 前缀, 1: 后缀

      for (let i = 0; i < remainingReads.length; i += 1) {
        const read = remainingReads[i];

        // 检查前缀重叠
        const prefixOverlap = this.findOverlap(currentContig, read, true);
        if (prefixOverlap > bestOverlap) {
          bestOverlap = prefixOverlap;
          bestReadIndex = i;
          bestPosition = 0;
        }

        // 检查后缀重叠
        const suffixOverlap = this.findOverlap(currentContig, read, false);
        if (suffixOverlap > bestOverlap) {
          bestOverlap = suffixOverlap;
          bestReadIndex = i;
          bestPosition = 1;
        }
      }

      if (bestOverlap < 10) break; // 最小重叠长度

      const bestRead = remainingReads[bestReadIndex];
      if (bestPosition === 0) {
        // 添加到开头
        currentContig = bestRead + currentContig.slice(bestOverlap);
      } else {
        // 添加到结尾
        currentContig = currentContig.slice(0, currentContig.length - bestOverlap) + bestRead;
      }

      remainingReads.splice(bestReadIndex, 1);
    }

    return currentContig;
  }

  private findOverlap(seq1: string, seq2: string, seq1IsPrefix: boolean): number {
    const [shorter, longer] = seq1IsPrefix ? [seq2, seq1] : [seq1, seq2];
    const maxOverlap = Math.min(shorter.length, longer.length);

    for (let overlap = maxOverlap; overlap >= 1; overlap -= 1) {
      const shorterSuffix = shorter.slice(shorter.length - overlap);
      const longerPrefix = longer.slice(0, overlap);

      if (shorterSuffix === longerPrefix) {
        return overlap;
      }
    }

    return 0;
  }
}

6. 实现要点

Needleman-Wunsch 用于全局比对。
Smith-Waterman 用于局部比对。
KMP 提供高效的序列搜索。
基因组组装处理重叠片段。

算法知识库：生物信息学算法实现

生物信息学算法实现

1. Needleman-Wunsch 全局序列比对

2. Smith-Waterman 局部序列比对

3. KMP 字符串匹配 (用于序列搜索)

4. 动态规划序列比对 (简化版)

5. 基因组组装 (重叠-布局-共识简化版)

6. 实现要点