算法 · 2026-04-27 · 9 分钟
算法知识库:自然语言处理基础算法实现
JavaScript/TypeScript 实现自然语言处理基础算法,如分词、TF-IDF、情感分析等。
自然语言处理基础算法实现
1. 简单分词器
ts
/**
 * Whitespace/punctuation tokenizer with a configurable stop-word list.
 *
 * Pipeline: lower-case the text, replace every character that is not a
 * letter, combining mark, digit, underscore or whitespace with a space,
 * split on whitespace, then drop empty strings and stop words.
 */
class SimpleTokenizer {
  /** Lower-cased words that `tokenize` removes from its output. */
  private readonly stopWords: Set<string> = new Set([
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
  ]);

  /**
   * Splits `text` into lower-cased tokens with stop words removed.
   *
   * @param text - Raw input text.
   * @returns Tokens in order of appearance; empty when nothing survives.
   */
  tokenize(text: string): string[] {
    return text
      .toLowerCase()
      // Unicode property escapes instead of `\w`: `\w` is ASCII-only and
      // would cut words such as "café" at the accented letter.
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
      .split(/\s+/)
      .filter((word) => word.length > 0 && !this.stopWords.has(word));
  }

  /**
   * Adds a word to the stop-word list (stored lower-cased).
   *
   * @param word - Word to filter out from now on.
   */
  addStopWord(word: string): void {
    this.stopWords.add(word.toLowerCase());
  }

  /**
   * Removes a word from the stop-word list (matched lower-cased).
   *
   * @param word - Word that should no longer be filtered out.
   */
  removeStopWord(word: string): void {
    this.stopWords.delete(word.toLowerCase());
  }
}
// 2. TF-IDF 向量器
ts
/**
 * Minimal TF-IDF vectorizer.
 *
 * `fit` learns a vocabulary (term -> column index, in first-seen order) and
 * an IDF weight per term from a corpus; `transform` maps documents to dense
 * vectors of `tf * idf` values over that vocabulary.
 *
 * Weights used:
 * - tf(t, d)  = count of t in d / number of tokens in d
 * - idf(t)    = ln(N / (1 + df(t))), N = number of fitted documents and
 *               df(t) = number of fitted documents containing t.
 *   This is 0 for a term found in N - 1 documents and negative for a term
 *   found in all N documents.
 */
class TFIDFVectorizer {
  private vocabulary: Map<string, number> = new Map();
  private idf: Map<string, number> = new Map();
  private documents: string[][] = [];

  /**
   * Learns vocabulary and IDF weights from `documents`, replacing anything
   * learned by a previous call.
   *
   * @param documents - Raw corpus, one string per document.
   */
  fit(documents: string[]): void {
    this.documents = documents.map((doc) => this.tokenize(doc));
    this.buildVocabulary();
    this.computeIDF();
  }

  /**
   * Converts documents to TF-IDF vectors using the fitted vocabulary.
   *
   * Terms outside the vocabulary contribute no component, but they still
   * count towards the document length used as the TF denominator.
   *
   * @param documents - Raw documents to vectorize.
   * @returns One vector per document, each of length `vocabulary.size`.
   */
  transform(documents: string[]): number[][] {
    return documents.map((doc) => {
      const tokens = this.tokenize(doc);
      const vector = new Array<number>(this.vocabulary.size).fill(0);

      const termFreq = new Map<string, number>();
      for (const token of tokens) {
        termFreq.set(token, (termFreq.get(token) ?? 0) + 1);
      }

      for (const [term, freq] of termFreq) {
        const index = this.vocabulary.get(term);
        if (index === undefined) {
          continue; // term was never seen during fit
        }
        const tf = freq / tokens.length;
        vector[index] = tf * (this.idf.get(term) ?? 0);
      }
      return vector;
    });
  }

  /**
   * Convenience for `fit(documents)` followed by `transform(documents)`.
   *
   * @param documents - Raw corpus.
   * @returns TF-IDF vectors of the corpus itself.
   */
  fitTransform(documents: string[]): number[][] {
    this.fit(documents);
    return this.transform(documents);
  }

  /** Lower-cases, strips punctuation and splits on whitespace. */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      // Unicode-aware replacement for `\w`, which is ASCII-only and would
      // split words at accented or non-Latin letters.
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
      .split(/\s+/)
      .filter((word) => word.length > 0);
  }

  /** Rebuilds the term -> index map from the tokenized corpus. */
  private buildVocabulary(): void {
    // Start from scratch: without this a second fit() keeps stale terms and
    // hands out indices that collide with theirs.
    this.vocabulary.clear();
    for (const doc of this.documents) {
      for (const term of doc) {
        if (!this.vocabulary.has(term)) {
          this.vocabulary.set(term, this.vocabulary.size);
        }
      }
    }
  }

  /** Recomputes the IDF weight of every vocabulary term. */
  private computeIDF(): void {
    this.idf.clear();

    // One pass over the corpus: count each term once per document.
    const docFreq = new Map<string, number>();
    for (const doc of this.documents) {
      for (const term of new Set(doc)) {
        docFreq.set(term, (docFreq.get(term) ?? 0) + 1);
      }
    }

    const numDocs = this.documents.length;
    for (const term of this.vocabulary.keys()) {
      const docCount = docFreq.get(term) ?? 0;
      this.idf.set(term, Math.log(numDocs / (1 + docCount)));
    }
  }

  /**
   * @returns Vocabulary terms ordered by their vector column index.
   */
  getFeatureNames(): string[] {
    return Array.from(this.vocabulary.keys());
  }
}
// 3. 余弦相似度
ts
/**
 * Cosine similarity of two numeric vectors: dot(vec1, vec2) divided by the
 * product of their Euclidean norms.
 *
 * @param vec1 - First vector.
 * @param vec2 - Second vector; must have as many components as `vec1`.
 * @returns The similarity, or 0 when either vector has zero norm (this
 *   includes two empty vectors).
 * @throws Error when the vectors differ in length.
 */
function cosineSimilarity(vec1: number[], vec2: number[]): number {
  const dimension = vec1.length;
  if (dimension !== vec2.length) {
    throw new Error('Vectors must have the same length');
  }

  // Accumulate the dot product and both squared norms in a single sweep.
  let dot = 0;
  let sumSquares1 = 0;
  let sumSquares2 = 0;
  let k = 0;
  while (k < dimension) {
    const a = vec1[k];
    const b = vec2[k];
    dot += a * b;
    sumSquares1 += a ** 2;
    sumSquares2 += b ** 2;
    k += 1;
  }

  const magnitude1 = Math.sqrt(sumSquares1);
  const magnitude2 = Math.sqrt(sumSquares2);

  // A zero vector has no direction; report "no similarity" instead of NaN.
  const hasZeroVector = magnitude1 === 0 || magnitude2 === 0;
  return hasZeroVector ? 0 : dot / (magnitude1 * magnitude2);
}
// 4. 简单情感分析
ts
/**
 * Lexicon-based sentiment scorer.
 *
 * Every token found in the positive list adds 1 to the score and every token
 * found in the negative list subtracts 1. A token present in both lists
 * counts as positive, because the positive list is checked first.
 */
class SentimentAnalyzer {
  private readonly positiveWords: Set<string>;
  private readonly negativeWords: Set<string>;

  constructor() {
    this.positiveWords = new Set(['good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'love', 'like', 'best', 'awesome']);
    this.negativeWords = new Set(['bad', 'terrible', 'awful', 'horrible', 'hate', 'dislike', 'worst', 'poor', 'ugly', 'stupid']);
  }

  /**
   * Scores `text` against the two word lists.
   *
   * @param text - Raw input text.
   * @returns `score` (positive hits minus negative hits) and the matching
   *   label: 'positive' when score > 0, 'negative' when score < 0,
   *   otherwise 'neutral'.
   */
  analyze(text: string): { score: number; sentiment: 'positive' | 'negative' | 'neutral' } {
    let score = 0;
    for (const token of this.tokenize(text)) {
      if (this.positiveWords.has(token)) {
        score += 1;
      } else if (this.negativeWords.has(token)) {
        score -= 1;
      }
    }

    let sentiment: 'positive' | 'negative' | 'neutral' = 'neutral';
    if (score > 0) {
      sentiment = 'positive';
    } else if (score < 0) {
      sentiment = 'negative';
    }
    return { score, sentiment };
  }

  /** Lower-cases, strips punctuation and splits on whitespace. */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      // Unicode-aware instead of `\w`: with the ASCII-only `\w`, a lexicon
      // entry containing a non-ASCII letter could never match a token.
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
      .split(/\s+/)
      .filter((word) => word.length > 0);
  }

  /**
   * Adds a word to the positive list (stored lower-cased).
   *
   * @param word - Word that should count +1.
   */
  addPositiveWord(word: string): void {
    this.positiveWords.add(word.toLowerCase());
  }

  /**
   * Adds a word to the negative list (stored lower-cased).
   *
   * @param word - Word that should count -1.
   */
  addNegativeWord(word: string): void {
    this.negativeWords.add(word.toLowerCase());
  }
}
// 5. N-gram 生成器
ts
/**
 * Builds all contiguous word n-grams of a text.
 *
 * The text is lower-cased, punctuation is replaced by spaces and the result
 * is split on whitespace; each n-gram is `n` consecutive tokens joined by a
 * single space.
 *
 * @param text - Raw input text.
 * @param n - Number of tokens per n-gram; must be a positive integer.
 * @returns The n-grams in order of appearance; empty when the text has
 *   fewer than `n` tokens.
 * @throws RangeError when `n` is not a positive integer.
 */
function generateNGrams(text: string, n: number): string[] {
  // Without this guard n = 0 yields a list of empty strings and negative or
  // fractional n yields meaningless slices.
  if (!Number.isInteger(n) || n < 1) {
    throw new RangeError(`n must be a positive integer, got ${n}`);
  }

  const tokens = text
    .toLowerCase()
    // Unicode-aware replacement for `\w`, which is ASCII-only and would
    // split words at accented or non-Latin letters.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
    .split(/\s+/)
    .filter((word) => word.length > 0);

  const ngrams: string[] = [];
  for (let start = 0; start + n <= tokens.length; start += 1) {
    ngrams.push(tokens.slice(start, start + n).join(' '));
  }
  return ngrams;
}
// 6. 简单文本分类器 (朴素贝叶斯)
ts
/**
 * Multinomial Naive Bayes text classifier with add-one (Laplace) smoothing.
 *
 * `train` may be called repeatedly; counts accumulate across calls.
 * `predict` returns the label maximising
 * `ln P(label) + sum over tokens of ln P(token | label)`.
 */
class NaiveBayesClassifier {
  /** label -> (token -> occurrences of the token in that label's documents). */
  private wordCounts: Map<string, Map<string, number>> = new Map();
  /** label -> total number of tokens seen for that label. */
  private wordTotals: Map<string, number> = new Map();
  /** label -> number of training documents carrying that label. */
  private classCounts: Map<string, number> = new Map();
  /** Every distinct token seen in training, across all labels. */
  private vocabulary: Set<string> = new Set();
  private totalDocuments = 0;

  /**
   * Adds labelled documents to the model.
   *
   * @param documents - Training examples, each a raw text and its label.
   */
  train(documents: { text: string; label: string }[]): void {
    for (const { text, label } of documents) {
      this.totalDocuments += 1;
      this.classCounts.set(label, (this.classCounts.get(label) ?? 0) + 1);

      const tokens = this.tokenize(text);
      if (tokens.length === 0) {
        // Only the class prior is affected; no per-label counts are created.
        continue;
      }

      let labelCounts = this.wordCounts.get(label);
      if (labelCounts === undefined) {
        labelCounts = new Map();
        this.wordCounts.set(label, labelCounts);
      }
      for (const token of tokens) {
        this.vocabulary.add(token);
        labelCounts.set(token, (labelCounts.get(token) ?? 0) + 1);
      }
      // Keep a running total so scoring never has to re-sum the counts.
      this.wordTotals.set(label, (this.wordTotals.get(label) ?? 0) + tokens.length);
    }
  }

  /**
   * Classifies `text`.
   *
   * @param text - Raw input text.
   * @returns The highest-scoring label (the first one trained wins a tie),
   *   or '' when the model has not been trained.
   */
  predict(text: string): string {
    const tokens = this.tokenize(text);
    let bestLabel = '';
    let bestScore = -Infinity;

    for (const [label, docCount] of this.classCounts) {
      // Work in log space so products of small probabilities do not underflow.
      let score = Math.log(docCount / this.totalDocuments);
      for (const token of tokens) {
        score += Math.log(this.getWordProbability(token, label));
      }
      if (score > bestScore) {
        bestScore = score;
        bestLabel = label;
      }
    }
    return bestLabel;
  }

  /** Lower-cases, strips punctuation and splits on whitespace. */
  private tokenize(text: string): string[] {
    return text
      .toLowerCase()
      // Unicode-aware replacement for `\w`, which is ASCII-only and would
      // split words at accented or non-Latin letters.
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
      .split(/\s+/)
      .filter((word) => word.length > 0);
  }

  /**
   * Smoothed estimate of P(word | label):
   * (count(word, label) + 1) / (tokens(label) + |vocabulary|).
   * A label that has no token counts at all uses 1 / (|vocabulary| + 1).
   */
  private getWordProbability(word: string, label: string): number {
    const labelCounts = this.wordCounts.get(label);
    if (labelCounts === undefined) {
      return 1 / (this.vocabulary.size + 1);
    }
    const wordCount = labelCounts.get(word) ?? 0;
    const totalWords = this.wordTotals.get(label) ?? 0;
    return (wordCount + 1) / (totalWords + this.vocabulary.size);
  }
}
// 7. 实现要点
- 分词是 NLP 的基础步骤。
- TF-IDF 衡量词的重要性。
- 余弦相似度计算文本相似性。
- 情感分析基于词典方法。
- 朴素贝叶斯适合文本分类。
算法 · 自然语言处理 · JavaScript