Text Preprocessing
Basic Text Cleaning
# Setup for basic text cleaning: regex, punctuation table, and NLTK resources.
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download required NLTK data.
nltk.download('punkt')
nltk.download('punkt_tab')   # required by word_tokenize on NLTK >= 3.8.2
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')     # wordnet companion data needed by WordNetLemmatizer on newer NLTK
# Basic text cleaning function
def clean_text(text):
# Convert to lowercase
text = text.lower()
# Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))
# Remove numbers
text = re.sub(r'\d+', '', text)
# Remove extra whitespace
text = ' '.join(text.split())
return text
Tokenization & Normalization
# --- Tokenization & normalization demo ---
text = "Natural Language Processing is amazing!"

# Word tokenization: punctuation becomes its own token.
tokens = word_tokenize(text)
print(tokens)
# ['Natural', 'Language', 'Processing', 'is', 'amazing', '!']

# Sentence tokenization.
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize("This is sentence one. This is sentence two.")
print(sentences)

# Remove common function words ("the", "is", ...) before modeling.
stop_words = set(stopwords.words('english'))
filtered_tokens = [tok for tok in tokens if tok.lower() not in stop_words]

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
words = ["running", "ran", "runs", "easily", "fairly"]

# Stemming: crude suffix chopping; may yield non-words.
stems = list(map(stemmer.stem, words))
print(stems)
# ['run', 'ran', 'run', 'easili', 'fairli']

# Lemmatization: dictionary-based; pos='v' treats each token as a verb.
lemmas = [lemmatizer.lemmatize(w, pos='v') for w in words]
print(lemmas)
# ['run', 'run', 'run', 'easily', 'fairly']
Feature Extraction
Bag of Words & TF-IDF
# --- Bag-of-Words and TF-IDF feature extraction ---
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Toy corpus of four short documents.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Bag of Words: raw term counts per document.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
print(vectorizer.get_feature_names_out())  # learned vocabulary
print(X.toarray())                         # dense document-term count matrix

# TF-IDF: counts re-weighted by inverse document frequency.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)
print(X_tfidf.toarray())
TF(t,d) = (count of term t in document d) / (total number of terms in document d)
IDF(t) = log(N / df(t)), where N = total number of documents and df(t) = number of documents containing t
TF-IDF(t,d) = TF(t,d) × IDF(t)
(Note: scikit-learn's TfidfVectorizer uses the smoothed variant idf(t) = ln((1+N)/(1+df(t))) + 1 and L2-normalizes each document row, so its values differ slightly from the textbook formula above.)
Word Embeddings
# --- Pre-trained word embeddings (word2vec, 300-d Google News vectors) ---
import gensim.downloader as api

# Downloads ~1.6 GB on first use, then caches locally.
word_vectors = api.load("word2vec-google-news-300")

# Look up the dense vector for a single word.
vector = word_vectors["king"]
print(f"Vector dimension: {vector.shape}")

# Nearest neighbors in embedding space by cosine similarity.
similar_words = word_vectors.most_similar("king", topn=5)
for neighbor, similarity in similar_words:
    print(f"{neighbor}: {similarity:.4f}")

# Vector arithmetic analogy: king - man + woman ≈ queen.
result = word_vectors.most_similar(positive=["woman", "king"], negative=["man"])
print(result[0])

# --- Training word2vec from scratch on a tiny tokenized corpus ---
from gensim.models import Word2Vec

sentences = [
    ["this", "is", "sentence", "one"],
    ["this", "is", "sentence", "two"],
]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Round-trip the trained model through disk.
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")
NLP Models
Traditional ML Models
# --- Text classification with Multinomial Naive Bayes ---
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Vectorize the corpus (vocabulary capped at 1000 terms) and attach toy labels.
# NOTE(review): `documents` comes from the feature-extraction snippet above.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(documents)
y = [0, 1, 0, 1]  # Example labels

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Fit the classifier on the training split.
model = MultinomialNB()
model.fit(X_train, y_train)

# Score on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))
Naive Bayes - Fast, works well with high-dimensional data
Logistic Regression - Good baseline, interpretable
Support Vector Machines - Effective for text classification
Random Forests - Robust, handles non-linear relationships
Deep Learning Models
import torch
import torch.nn as nn
import torch.optim as optim
class TextLSTM(nn.Module):
    """LSTM text classifier: embedding -> stacked LSTM -> dropout -> linear head."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        # NOTE: submodule creation order is kept stable so seeded random
        # initialization stays reproducible.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text: integer token ids, assumed (batch, seq) — TODO confirm with caller.
        embedded = self.embedding(text)
        _, (hidden, _) = self.lstm(embedded)
        # Summarize the sequence with the top layer's final hidden state.
        summary = self.dropout(hidden[-1])
        return self.fc(summary)
# Hyperparameters for the demo classifier.
VOCAB_SIZE = 10000    # tokenizer vocabulary size
EMBEDDING_DIM = 100   # embedding width
HIDDEN_DIM = 256      # LSTM hidden width
OUTPUT_DIM = 2        # number of target classes
N_LAYERS = 2          # stacked LSTM layers
DROPOUT = 0.5         # applied between LSTM layers and before the head

model = TextLSTM(
    VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT
)
Transformers
BERT Implementation
# --- BERT sequence classification (inference with a freshly-initialized head) ---
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch

# Pre-trained encoder plus a 2-class classification head.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Encode one sentence into input-id / attention-mask tensors.
text = "This is a sample text for classification."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Forward pass; logits are raw (unnormalized) class scores.
outputs = model(**inputs)
logits = outputs.logits

# argmax over the class dimension selects the predicted label.
predictions = torch.argmax(logits, dim=-1)
print(f"Predicted class: {predictions.item()}")
# --- Fine-tuning with the Hugging Face Trainer API ---
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',           # where checkpoints/results are written
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,                 # linear LR warmup before decay
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Create trainer
# NOTE(review): `train_dataset` and `eval_dataset` are not defined anywhere in
# this snippet; they must be built elsewhere (e.g. tokenized dataset objects)
# before this runs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune model
trainer.train()
GPT & Other Models
# --- Text generation with GPT-2 ---
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the pre-trained GPT-2 language model and matching tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Encode the prompt to token ids.
input_text = "The future of artificial intelligence"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Sample a continuation; temperature only takes effect because do_sample=True.
output = model.generate(
    input_ids,
    max_length=100,
    num_return_sequences=1,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
)

# Decode ids back to text, dropping special tokens.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
BERT - Bidirectional Encoder Representations (Google)
GPT - Generative Pre-trained Transformer (OpenAI)
RoBERTa - Robustly Optimized BERT (Facebook)
DistilBERT - Smaller, faster version of BERT
T5 - Text-to-Text Transfer Transformer (Google)
Advanced NLP Tasks
Named Entity Recognition
# --- Named Entity Recognition with spaCy ---
import spacy

# Small English pipeline (install via: python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")

text = "Apple is looking at buying U.K. startup for $1 billion. Tim Cook is the CEO."
doc = nlp(text)

# Each entity exposes its surface text and a coarse type label.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))
# Apple ORG Companies, agencies, institutions, etc.
# U.K. GPE Countries, cities, states
# $1 billion MONEY Monetary values, including unit
# Tim Cook PERSON People, including fictional

# --- NER with a transformer fine-tuned on CoNLL-2003 ---
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# NOTE(review): without aggregation, results are per word-piece, so long names
# may appear split across several entries.
results = ner_pipeline("Hugging Face is a company based in New York City.")
for entity in results:
    print(f"{entity['word']}: {entity['entity']} (confidence: {entity['score']:.4f})")
Sentiment Analysis
# --- Lexicon-based sentiment with TextBlob ---
from textblob import TextBlob

text = "I love this product! It's amazing."
blob = TextBlob(text)

# TextBlob reports a (polarity, subjectivity) pair for the whole text.
sentiment = blob.sentiment
polarity = sentiment.polarity
subjectivity = sentiment.subjectivity
print(f"Polarity: {polarity:.4f}")  # Range: -1 (negative) to 1 (positive)
print(f"Subjectivity: {subjectivity:.4f}")  # Range: 0 (objective) to 1 (subjective)

# Map the signed polarity score onto a three-way label.
if polarity > 0:
    print("Positive sentiment")
elif polarity < 0:
    print("Negative sentiment")
else:
    print("Neutral sentiment")
# --- Sentiment analysis via the transformers pipeline ---
from transformers import pipeline

# NOTE(review): no model is pinned here; the library picks a default checkpoint.
# Pass model=... for reproducible results across library versions.
classifier = pipeline("sentiment-analysis")

texts = [
    "I love this product! It's amazing.",
    "This is the worst thing I've ever bought.",
    "It's okay, nothing special."
]

# The pipeline accepts a batch and returns one {label, score} dict per input.
results = classifier(texts)
for sample, verdict in zip(texts, results):
    print(f"Text: {sample}")
    print(f"Sentiment: {verdict['label']}, Score: {verdict['score']:.4f}\n")
Additional Resources
Learning Resources
- Books: "Speech and Language Processing" by Jurafsky & Martin
- Courses: Stanford CS224N, Fast.ai NLP, Coursera NLP Specialization
- Documentation: Hugging Face, spaCy, NLTK, Gensim
- Research Papers: BERT, GPT, Transformer, Attention Is All You Need
- Communities: Hugging Face, Papers with Code, Reddit ML/NLP
Useful Libraries
- Core NLP: NLTK, spaCy, Stanford CoreNLP
- Deep Learning: PyTorch, TensorFlow, Keras
- Transformers: Hugging Face Transformers
- Word Embeddings: Gensim, FastText
- Utilities: TextBlob, Pattern, Scikit-learn