Text Preprocessing

Basic Text Cleaning

# Import libraries
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download required NLTK data: 'punkt' (sentence/word tokenizer models),
# 'stopwords' (per-language stopword lists), 'wordnet' (lemmatizer lexicon).
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab'
# for word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Basic text cleaning function
def clean_text(text):
    """Normalize raw text for downstream NLP processing.

    Lowercases the input, strips ASCII punctuation and digit runs, and
    collapses all whitespace to single spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercase, single-spaced string.
    """
    lowered = text.lower()
    # One C-level translation pass removes every char in string.punctuation.
    no_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    # Drop digit runs entirely (note: punctuation was already removed, so
    # "3.14" has become "314" by this point and is deleted whole).
    no_digits = re.sub(r'\d+', '', no_punct)
    # split() with no args splits on any whitespace and discards empties,
    # so joining with single spaces also trims leading/trailing blanks.
    return ' '.join(no_digits.split())

Tokenization & Normalization

# Tokenization
text = "Natural Language Processing is amazing!"

# Word tokenization: splits into word and punctuation tokens (note the
# trailing '!' is kept as its own token).
tokens = word_tokenize(text)
print(tokens)
# Output: ['Natural', 'Language', 'Processing', 'is', 'amazing', '!']

# Sentence tokenization: splits a passage into individual sentences.
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize("This is sentence one. This is sentence two.")
print(sentences)

# Stopwords removal: drop high-frequency function words ('is', 'the', ...).
# Tokens are lowercased only for the comparison, so surviving tokens keep
# their original casing.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
# Stemming and Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ["running", "ran", "runs", "easily", "fairly"]

# Stemming: rule-based suffix stripping — fast, but can produce non-words
# ('easili', 'fairli') and misses irregular forms ('ran' stays 'ran').
stems = [stemmer.stem(word) for word in words]
print(stems)
# Output: ['run', 'ran', 'run', 'easili', 'fairli']

# Lemmatization: dictionary-based reduction to a valid base form. pos='v'
# treats every word as a verb, which is why irregular 'ran' maps to 'run'.
lemmas = [lemmatizer.lemmatize(word, pos='v') for word in words]
print(lemmas)
# Output: ['run', 'run', 'run', 'easily', 'fairly']

Feature Extraction

Bag of Words & TF-IDF

# Using scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Bag of Words: fit learns the vocabulary; each row of X is a raw term-count
# vector over that vocabulary (stored as a sparse matrix).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
print(vectorizer.get_feature_names_out())
print(X.toarray())

# TF-IDF: counts re-weighted so terms that appear in every document score low.
# NOTE(review): sklearn applies smoothed IDF and L2 row normalization by
# default, so these values differ slightly from the textbook TF-IDF formula.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)
print(X_tfidf.toarray())
TF-IDF Formula

TF(t,d) = (Number of times term t appears in document d) / (Total number of terms in document d)

IDF(t) = log(Total number of documents / Number of documents containing term t)

(Note: scikit-learn's TfidfVectorizer, used above, computes a smoothed variant — log((1 + N) / (1 + df)) + 1 — and L2-normalizes each document vector by default, so its printed values differ slightly from this textbook formula.)

TF-IDF(t,d) = TF(t,d) × IDF(t)

Word Embeddings

# Using pre-trained Word2Vec
import gensim.downloader as api

# Load pre-trained model (downloaded and cached on first use; the file is
# very large — this call can take a long time).
word_vectors = api.load("word2vec-google-news-300")

# Get vector for a word: a 300-dimensional dense embedding (per model name).
vector = word_vectors["king"]
print(f"Vector dimension: {vector.shape}")

# Find similar words: nearest neighbours by cosine similarity.
similar_words = word_vectors.most_similar("king", topn=5)
for word, score in similar_words:
    print(f"{word}: {score:.4f}")

# Word analogy: king - man + woman = ?  (classically resolves to 'queen')
result = word_vectors.most_similar(positive=["woman", "king"], negative=["man"])
print(result[0])

# Training custom Word2Vec model
from gensim.models import Word2Vec

# Tokenized sentences: the training corpus is a list of token lists.
sentences = [["this", "is", "sentence", "one"],
             ["this", "is", "sentence", "two"]]

# Train model: 100-dim vectors, context window of 5 tokens, min_count=1 keeps
# every token (needed for such a tiny corpus), 4 worker threads.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save and load model
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")

NLP Models

Traditional ML Models

# Text classification with Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Prepare data: TF-IDF features capped at the 1000 most frequent terms.
# NOTE(review): `documents` must already be defined (see the feature-
# extraction section); with only 4 samples the 20% split below leaves a
# single test example, so the metrics are illustrative only.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(documents)
y = [0, 1, 0, 1] # Example labels

# Split data: 80% train / 20% test, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model: multinomial NB suits non-negative count/TF-IDF features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))
Common ML Models for NLP

Naive Bayes - Fast, works well with high-dimensional data

Logistic Regression - Good baseline, interpretable

Support Vector Machines - Effective for text classification

Random Forests - Robust, handles non-linear relationships

Deep Learning Models

# Simple LSTM for text classification
import torch
import torch.nn as nn
import torch.optim as optim

class TextLSTM(nn.Module):
    """LSTM text classifier: embedding -> stacked LSTM -> linear head.

    The final hidden state of the top LSTM layer is dropout-regularized
    and projected to ``output_dim`` class logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        # Maps integer token ids to dense embedding_dim-sized vectors.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: tensors are (batch, seq, feature). The LSTM's
        # own dropout is applied between layers, so it only has an effect
        # when n_layers > 1.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                           batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map (batch, seq) token ids to (batch, output_dim) logits."""
        _, (final_hidden, _cell) = self.lstm(self.embedding(text))
        # final_hidden has shape (n_layers, batch, hidden_dim); index -1
        # selects the top layer's last hidden state for each sequence.
        regularized = self.dropout(final_hidden[-1])
        return self.fc(regularized)

# Model parameters for the TextLSTM class defined above.
VOCAB_SIZE = 10000  # number of distinct token ids (embedding rows)
EMBEDDING_DIM = 100  # size of each token embedding vector
HIDDEN_DIM = 256  # LSTM hidden-state width
OUTPUT_DIM = 2  # number of output classes (binary classification)
N_LAYERS = 2  # stacked LSTM layers (inter-layer dropout applies since > 1)
DROPOUT = 0.5  # dropout probability for both the LSTM and the head

# Instantiate the (untrained) classifier.
model = TextLSTM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)

Transformers

BERT Implementation

# Using Hugging Face Transformers
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch

# Load pre-trained BERT model and tokenizer. num_labels=2 attaches a fresh,
# randomly initialized binary classification head on top of the encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize text into a batch of PyTorch tensors, truncating at max_length=512.
text = "This is a sample text for classification."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Get model outputs. NOTE(review): the classification head is untrained at
# this point, so the prediction below is effectively random.
outputs = model(**inputs)
logits = outputs.logits

# Get predictions: index of the highest logit along the class dimension.
predictions = torch.argmax(logits, dim=-1)
print(f"Predicted class: {predictions.item()}")
# Fine-tuning BERT for custom task
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',  # where checkpoints and final weights are saved
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,  # linear learning-rate warm-up steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Create trainer.
# NOTE(review): `train_dataset` and `eval_dataset` are not defined anywhere
# in this snippet — the caller must supply tokenized, labelled datasets
# before this code will run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune model
trainer.train()

GPT & Other Models

# Using GPT-2 for text generation
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Generate text
input_text = "The future of artificial intelligence"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate text with various parameters
output = model.generate(
    input_ids,
    max_length=100,  # total length cap, prompt tokens included
    num_return_sequences=1,
    temperature=0.7,  # < 1.0 sharpens the sampling distribution
    do_sample=True,  # sample instead of greedy decoding (required for temperature to matter)
    pad_token_id=tokenizer.eos_token_id  # GPT-2 has no pad token; reuse EOS
)

# Decode the first (only) generated sequence back into a string.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
Popular Transformer Models

BERT - Bidirectional Encoder Representations (Google)

GPT - Generative Pre-trained Transformer (OpenAI)

RoBERTa - Robustly Optimized BERT (Facebook)

DistilBERT - Smaller, faster version of BERT

T5 - Text-to-Text Transfer Transformer (Google)

Advanced NLP Tasks

Named Entity Recognition

# Using spaCy for NER
import spacy

# Load English model. NOTE(review): requires a prior
# `python -m spacy download en_core_web_sm` or spacy.load raises OSError.
nlp = spacy.load("en_core_web_sm")

# Process text through the full pipeline (tokenizer, tagger, parser, NER).
text = "Apple is looking at buying U.K. startup for $1 billion. Tim Cook is the CEO."
doc = nlp(text)

# Extract entities: each ent carries its surface text and a label;
# spacy.explain turns the label code into a human-readable description.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))

# Output:
# Apple ORG Companies, agencies, institutions, etc.
# U.K. GPE Countries, cities, states
# $1 billion MONEY Monetary values, including unit
# Tim Cook PERSON People, including fictional
# Using Transformers for NER
# NOTE(review): AutoTokenizer and AutoModelForTokenClassification are
# imported but never used below — the pipeline loads both internally.
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load NER pipeline backed by a cased BERT-large fine-tuned on CoNLL-03.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Perform NER: each result dict carries the token ('word'), its entity tag
# ('entity'), and a confidence score.
results = ner_pipeline("Hugging Face is a company based in New York City.")
for entity in results:
    print(f"{entity['word']}: {entity['entity']} (confidence: {entity['score']:.4f})")

Sentiment Analysis

# Using TextBlob for simple sentiment analysis
from textblob import TextBlob

# Analyze sentiment
text = "I love this product! It's amazing."
blob = TextBlob(text)

# Get sentiment polarity and subjectivity (lexicon-based scores).
polarity = blob.sentiment.polarity
subjectivity = blob.sentiment.subjectivity

print(f"Polarity: {polarity:.4f}") # Range: -1 (negative) to 1 (positive)
print(f"Subjectivity: {subjectivity:.4f}") # Range: 0 (objective) to 1 (subjective)

# Classify by the sign of the polarity score; exactly 0.0 counts as neutral.
if polarity > 0:
    print("Positive sentiment")
elif polarity < 0:
    print("Negative sentiment")
else:
    print("Neutral sentiment")
# Using Transformers for advanced sentiment analysis
from transformers import pipeline

# Create sentiment analysis pipeline (downloads a default English sentiment
# model on first use when no model name is given).
classifier = pipeline("sentiment-analysis")

# Analyze multiple texts in one batched call.
texts = [
    "I love this product! It's amazing.",
    "This is the worst thing I've ever bought.",
    "It's okay, nothing special."
]

# Results come back in the same order as the inputs: one dict per text
# with a 'label' (e.g. POSITIVE/NEGATIVE) and a confidence 'score'.
results = classifier(texts)
for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"Sentiment: {result['label']}, Score: {result['score']:.4f}\n")

Additional Resources

Learning Resources

  • Books: "Speech and Language Processing" by Jurafsky & Martin
  • Courses: Stanford CS224N, Fast.ai NLP, Coursera NLP Specialization
  • Documentation: Hugging Face, spaCy, NLTK, Gensim
  • Research Papers: BERT, GPT, Transformer, Attention Is All You Need
  • Communities: Hugging Face, Papers with Code, Reddit ML/NLP

Useful Libraries

  • Core NLP: NLTK, spaCy, Stanford CoreNLP
  • Deep Learning: PyTorch, TensorFlow, Keras
  • Transformers: Hugging Face Transformers
  • Word Embeddings: Gensim, FastText
  • Utilities: TextBlob, Pattern, Scikit-learn