自然语言处理基础算法实现

1. 简单分词器

/**
 * Minimal whitespace tokenizer with a configurable stop-word list.
 *
 * Pipeline: lowercase the text, replace every character that is not a letter,
 * combining mark, digit, underscore or whitespace with a space, split on
 * whitespace runs, then drop empty strings and stop words.
 */
class SimpleTokenizer {
  /**
   * Characters treated as token separators. Uses Unicode property escapes
   * (requires the `u` flag) because plain `\w` only matches ASCII and would
   * split words such as "café" at the accented letter.
   */
  private static readonly NON_WORD_CHARS = /[^\p{L}\p{M}\p{N}_\s]/gu;

  /** Lowercased words that `tokenize` removes from its output. */
  private readonly stopWords: Set<string>;

  constructor() {
    this.stopWords = new Set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']);
  }

  /**
   * Splits `text` into lowercase word tokens, excluding stop words.
   *
   * @param text - Raw input text.
   * @returns Tokens in their original order; empty array when nothing remains.
   */
  tokenize(text: string): string[] {
    return text
      .toLowerCase()
      .replace(SimpleTokenizer.NON_WORD_CHARS, ' ')
      .split(/\s+/)
      .filter((word) => word.length > 0 && !this.stopWords.has(word));
  }

  /**
   * Registers an additional stop word (stored lowercased).
   *
   * @param word - Word to filter out of future `tokenize` results.
   */
  addStopWord(word: string): void {
    this.stopWords.add(word.toLowerCase());
  }

  /**
   * Unregisters a stop word (matched case-insensitively).
   *
   * @param word - Word that should no longer be filtered out.
   */
  removeStopWord(word: string): void {
    this.stopWords.delete(word.toLowerCase());
  }
}

2. TF-IDF 向量器

/**
 * Bag-of-words TF-IDF vectorizer.
 *
 * `fit` learns a vocabulary (term -> column index, in order of first
 * appearance) and an IDF weight per term from a corpus; `transform` maps
 * documents onto that vocabulary.
 *
 * Weights: `tf = count / tokensInDocument`, `idf = ln(N / (1 + df))`, where
 * `N` is the number of fitted documents and `df` the number of them that
 * contain the term. Note that this IDF is zero or negative for terms that
 * occur in (almost) every document.
 */
class TFIDFVectorizer {
  /**
   * Characters treated as token separators. Unicode-aware (`u` flag) because
   * plain `\w` only matches ASCII and would drop non-Latin text entirely.
   */
  private static readonly NON_WORD_CHARS = /[^\p{L}\p{M}\p{N}_\s]/gu;

  private vocabulary: Map<string, number> = new Map();
  private idf: Map<string, number> = new Map();
  private documents: string[][] = [];

  /**
   * Learns vocabulary and IDF weights from `documents`, replacing anything
   * learned by a previous call.
   *
   * @param documents - Raw corpus, one string per document.
   */
  fit(documents: string[]): void {
    this.documents = documents.map((doc) => this.tokenize(doc));
    // Start from a clean slate: without this, a second fit would keep stale
    // terms whose column indices collide with the newly assigned ones.
    this.vocabulary = new Map();
    this.idf = new Map();
    this.buildVocabulary();
    this.computeIDF();
  }

  /**
   * Converts documents to TF-IDF vectors over the fitted vocabulary.
   * Terms not seen during `fit` are ignored.
   *
   * @param documents - Raw documents to vectorize.
   * @returns One vector per document, each of length `vocabulary.size`.
   */
  transform(documents: string[]): number[][] {
    return documents.map((doc) => {
      const tokens = this.tokenize(doc);
      const vector = new Array<number>(this.vocabulary.size).fill(0);

      const termFreq = new Map<string, number>();
      for (const token of tokens) {
        termFreq.set(token, (termFreq.get(token) ?? 0) + 1);
      }

      for (const [term, freq] of termFreq) {
        const index = this.vocabulary.get(term);
        if (index === undefined) continue;
        // tokens.length > 0 here, because termFreq is non-empty.
        const tf = freq / tokens.length;
        vector[index] = tf * (this.idf.get(term) ?? 0);
      }

      return vector;
    });
  }

  /**
   * Convenience for `fit(documents)` followed by `transform(documents)`.
   *
   * @param documents - Raw corpus.
   * @returns TF-IDF vectors for the same corpus.
   */
  fitTransform(documents: string[]): number[][] {
    this.fit(documents);
    return this.transform(documents);
  }

  /** Lowercases, replaces non-word characters with spaces, splits on whitespace. */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      .replace(TFIDFVectorizer.NON_WORD_CHARS, ' ')
      .split(/\s+/)
      .filter((word) => word.length > 0);
  }

  /** Assigns each distinct term a column index in order of first appearance. */
  private buildVocabulary(): void {
    for (const doc of this.documents) {
      for (const term of doc) {
        if (!this.vocabulary.has(term)) {
          this.vocabulary.set(term, this.vocabulary.size);
        }
      }
    }
  }

  /** Computes `ln(N / (1 + df))` for every vocabulary term. */
  private computeIDF(): void {
    const numDocs = this.documents.length;

    // Count document frequency in a single pass over the corpus instead of
    // scanning every document once per vocabulary term.
    const docFrequency = new Map<string, number>();
    for (const doc of this.documents) {
      for (const term of new Set(doc)) {
        docFrequency.set(term, (docFrequency.get(term) ?? 0) + 1);
      }
    }

    for (const term of this.vocabulary.keys()) {
      const docCount = docFrequency.get(term) ?? 0;
      this.idf.set(term, Math.log(numDocs / (1 + docCount)));
    }
  }

  /**
   * @returns Vocabulary terms ordered by column index.
   */
  getFeatureNames(): string[] {
    return Array.from(this.vocabulary.keys());
  }
}

3. 余弦相似度

/**
 * Cosine of the angle between two equally sized numeric vectors.
 *
 * @param vec1 - First vector.
 * @param vec2 - Second vector; must have the same length as `vec1`.
 * @returns `dot(vec1, vec2) / (|vec1| * |vec2|)`, or `0` when either vector
 *          has zero magnitude (including the empty-vector case).
 * @throws Error when the vectors differ in length.
 */
function cosineSimilarity(vec1: number[], vec2: number[]): number {
  const dimension = vec1.length;
  if (dimension !== vec2.length) {
    throw new Error('Vectors must have the same length');
  }

  // Accumulate the dot product and both squared magnitudes in one sweep.
  let dot = 0;
  let squaredSum1 = 0;
  let squaredSum2 = 0;
  for (const [position, left] of vec1.entries()) {
    const right = vec2[position];
    dot += left * right;
    squaredSum1 += left ** 2;
    squaredSum2 += right ** 2;
  }

  const magnitude1 = Math.sqrt(squaredSum1);
  const magnitude2 = Math.sqrt(squaredSum2);

  // A zero vector has no direction; report "no similarity" instead of NaN.
  const degenerate = magnitude1 === 0 || magnitude2 === 0;
  return degenerate ? 0 : dot / (magnitude1 * magnitude2);
}

4. 简单情感分析

/**
 * Lexicon-based sentiment scorer.
 *
 * Each token found in the positive lexicon adds 1 to the score and each token
 * found in the negative lexicon subtracts 1. A word present in both lexicons
 * counts as positive because the positive lexicon is checked first.
 */
class SentimentAnalyzer {
  /**
   * Characters treated as token separators. Unicode-aware (`u` flag) because
   * plain `\w` only matches ASCII; with it, a lexicon entry such as "génial"
   * could never match since the text would be split at the accented letter.
   */
  private static readonly NON_WORD_CHARS = /[^\p{L}\p{M}\p{N}_\s]/gu;

  private readonly positiveWords: Set<string>;
  private readonly negativeWords: Set<string>;

  constructor() {
    this.positiveWords = new Set(['good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'love', 'like', 'best', 'awesome']);

    this.negativeWords = new Set(['bad', 'terrible', 'awful', 'horrible', 'hate', 'dislike', 'worst', 'poor', 'ugly', 'stupid']);
  }

  /**
   * Scores `text` against the two lexicons.
   *
   * @param text - Raw input text.
   * @returns `score` (positive hits minus negative hits) and the matching
   *          label: `'positive'` if score > 0, `'negative'` if score < 0,
   *          otherwise `'neutral'`.
   */
  analyze(text: string): { score: number; sentiment: 'positive' | 'negative' | 'neutral' } {
    let score = 0;

    for (const token of this.tokenize(text)) {
      if (this.positiveWords.has(token)) {
        score += 1;
      } else if (this.negativeWords.has(token)) {
        score -= 1;
      }
    }

    let sentiment: 'positive' | 'negative' | 'neutral';
    if (score > 0) {
      sentiment = 'positive';
    } else if (score < 0) {
      sentiment = 'negative';
    } else {
      sentiment = 'neutral';
    }

    return { score, sentiment };
  }

  /** Lowercases, replaces non-word characters with spaces, splits on whitespace. */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      .replace(SentimentAnalyzer.NON_WORD_CHARS, ' ')
      .split(/\s+/)
      .filter((word) => word.length > 0);
  }

  /**
   * Adds a word to the positive lexicon (stored lowercased).
   *
   * @param word - Single word; it must survive tokenization unchanged to match.
   */
  addPositiveWord(word: string): void {
    this.positiveWords.add(word.toLowerCase());
  }

  /**
   * Adds a word to the negative lexicon (stored lowercased).
   *
   * @param word - Single word; it must survive tokenization unchanged to match.
   */
  addNegativeWord(word: string): void {
    this.negativeWords.add(word.toLowerCase());
  }
}

5. N-gram 生成器

/**
 * Produces all contiguous word n-grams of `text`.
 *
 * The text is lowercased, characters other than letters, combining marks,
 * digits, underscores and whitespace are replaced by spaces, and the result
 * is split on whitespace before the sliding window is applied.
 *
 * @param text - Raw input text.
 * @param n - Window size in tokens; must be a positive integer.
 * @returns N-grams joined by single spaces, in order of appearance. Empty
 *          when `n` is not a positive integer or exceeds the token count.
 */
function generateNGrams(text: string, n: number): string[] {
  // A zero, negative or fractional window has no meaningful n-grams; without
  // this guard the loop below would emit empty or truncated slices.
  if (!Number.isInteger(n) || n < 1) {
    return [];
  }

  const tokens = text
    .toLowerCase()
    // Unicode-aware (`u` flag): plain `\w` would strip every non-ASCII letter.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
    .split(/\s+/)
    .filter((word) => word.length > 0);

  const ngrams: string[] = [];

  for (let i = 0; i <= tokens.length - n; i += 1) {
    ngrams.push(tokens.slice(i, i + n).join(' '));
  }

  return ngrams;
}

6. 简单文本分类器 (朴素贝叶斯)

/**
 * Multinomial naive Bayes text classifier with add-one (Laplace) smoothing.
 *
 * Training is incremental: every `train` call adds to the existing counts.
 * Prediction picks the label with the highest
 * `ln P(label) + sum(ln P(token | label))`.
 */
class NaiveBayesClassifier {
  /**
   * Characters treated as token separators. Unicode-aware (`u` flag) because
   * plain `\w` only matches ASCII and would discard non-Latin text, leaving
   * the classifier with nothing but class priors.
   */
  private static readonly NON_WORD_CHARS = /[^\p{L}\p{M}\p{N}_\s]/gu;

  /** label -> (token -> occurrences in documents of that label). */
  private wordCounts: Map<string, Map<string, number>> = new Map();
  /** label -> total token occurrences; kept in sync with `wordCounts`. */
  private wordTotals: Map<string, number> = new Map();
  /** label -> number of training documents. */
  private classCounts: Map<string, number> = new Map();
  private vocabulary: Set<string> = new Set();
  private totalDocuments = 0;

  /**
   * Adds labelled documents to the model.
   *
   * @param documents - Training examples; `text` is tokenized internally.
   */
  train(documents: { text: string; label: string }[]): void {
    for (const doc of documents) {
      this.totalDocuments += 1;
      this.classCounts.set(doc.label, (this.classCounts.get(doc.label) ?? 0) + 1);

      const tokens = this.tokenize(doc.text);
      if (tokens.length === 0) {
        // Still counts toward the class prior, but contributes no word counts.
        continue;
      }

      let labelCounts = this.wordCounts.get(doc.label);
      if (labelCounts === undefined) {
        labelCounts = new Map<string, number>();
        this.wordCounts.set(doc.label, labelCounts);
      }

      for (const token of tokens) {
        this.vocabulary.add(token);
        labelCounts.set(token, (labelCounts.get(token) ?? 0) + 1);
      }

      // Maintain the running total so prediction need not re-sum the map.
      this.wordTotals.set(doc.label, (this.wordTotals.get(doc.label) ?? 0) + tokens.length);
    }
  }

  /**
   * Returns the most probable label for `text`.
   *
   * @param text - Raw input text.
   * @returns The best-scoring label; on ties the label trained first wins.
   *          Returns `''` when the model has not been trained.
   */
  predict(text: string): string {
    const tokens = this.tokenize(text);
    let bestLabel = '';
    let bestScore = -Infinity;

    for (const [label, docCount] of this.classCounts) {
      // Work in log space so long documents do not underflow to zero.
      let score = Math.log(docCount / this.totalDocuments);

      for (const token of tokens) {
        score += Math.log(this.getWordProbability(token, label));
      }

      if (score > bestScore) {
        bestScore = score;
        bestLabel = label;
      }
    }

    return bestLabel;
  }

  /** Lowercases, replaces non-word characters with spaces, splits on whitespace. */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      .replace(NaiveBayesClassifier.NON_WORD_CHARS, ' ')
      .split(/\s+/)
      .filter((word) => word.length > 0);
  }

  /**
   * Smoothed `P(word | label)`: `(count + 1) / (labelTotal + vocabularySize)`.
   * A label that has no recorded words falls back to `1 / (vocabularySize + 1)`.
   */
  private getWordProbability(word: string, label: string): number {
    const labelCounts = this.wordCounts.get(label);
    if (labelCounts === undefined) return 1 / (this.vocabulary.size + 1);

    const wordCount = labelCounts.get(word) ?? 0;
    const totalWords = this.wordTotals.get(label) ?? 0;

    return (wordCount + 1) / (totalWords + this.vocabulary.size);
  }
}

7. 实现要点

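分词是 NLP 的基础步骤：这里先转为小写、将标点替换为空格，再按空白字符切分。
TF-IDF 衡量词对文档的重要性：词频（TF）乘以逆文档频率（IDF）。
余弦相似度通过两个向量夹角的余弦值来衡量文本相似性。
情感分析采用基于词典的方法：得分等于正面词出现次数减去负面词出现次数。
朴素贝叶斯适合文本分类，这里使用加一（拉普拉斯）平滑并在对数空间中累加概率。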

算法知识库：自然语言处理基础算法实现

自然语言处理基础算法实现

1. 简单分词器

2. TF-IDF 向量器

3. 余弦相似度

4. 简单情感分析

5. N-gram 生成器

6. 简单文本分类器 (朴素贝叶斯)

7. 实现要点