Text Preprocessing
Basic Text Cleaning
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download required NLTK data.
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' rather than
# the legacy pickled 'punkt' package, so both are requested: whichever one the
# installed version needs will then be available to word_tokenize/sent_tokenize.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)
# Basic text cleaning function
def clean_text(text):
    """Return a normalized copy of *text* for simple bag-of-words pipelines.

    The steps run in this order:

    1. lowercase the text,
    2. delete every character listed in ``string.punctuation`` (ASCII only),
    3. delete every run of digits,
    4. collapse all whitespace runs to single spaces and trim both ends.

    Punctuation and digits are deleted, not replaced by a space, so
    ``"stop-2-go"`` becomes ``"stopgo"``.

    Args:
        text: The input string.

    Returns:
        The cleaned string (possibly empty).
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation in a single pass via a deletion table
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace (split() with no argument also strips the ends)
    text = ' '.join(text.split())
    return text
Tokenization & Normalization
from nltk.tokenize import sent_tokenize

# Sample sentence shared by the word-level demos below.
text = "Natural Language Processing is amazing!"

# Word tokenization: punctuation becomes its own token.
tokens = word_tokenize(text)
print(tokens)
# Output: ['Natural', 'Language', 'Processing', 'is', 'amazing', '!']

# Sentence tokenization
sentences = sent_tokenize("This is sentence one. This is sentence two.")
print(sentences)

# Stopwords removal: compare case-insensitively, keep the original casing.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)

# Words used to contrast stemming with lemmatization.
words = ["running", "ran", "runs", "easily", "fairly"]

# Stemming: rule-based suffix stripping, results need not be real words.
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, words))
print(stems)
# Output: ['run', 'ran', 'run', 'easili', 'fairli']

# Lemmatization: dictionary lookup, here treating every word as a verb.
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(candidate, 'v') for candidate in words]
print(lemmas)
# Output: ['run', 'run', 'run', 'easily', 'fairly']
Feature Extraction
Bag of Words & TF-IDF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample documents: a tiny corpus reused by later snippets.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Bag of Words: learn the vocabulary, then count term occurrences per document.
vectorizer = CountVectorizer().fit(documents)
X = vectorizer.transform(documents)
print(vectorizer.get_feature_names_out())
print(X.toarray())

# TF-IDF: same vocabulary step, but counts are re-weighted by rarity.
tfidf_vectorizer = TfidfVectorizer().fit(documents)
X_tfidf = tfidf_vectorizer.transform(documents)
print(X_tfidf.toarray())
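TF(t,d) = (Number of times term t appears in document d) / (Total number of terms in document d)
IDF(t) = log(Total number of documents / Number of documents containing term t)
TF-IDF(t,d) = TF(t,d) × IDF(t)
Note: these are the textbook definitions. With its default settings, scikit-learn's TfidfVectorizer uses the raw term count as TF, a smoothed IDF of ln((1 + N) / (1 + df(t))) + 1 (N = number of documents, df(t) = number of documents containing t), and then L2-normalizes each document vector, so the printed values will not match a hand calculation with the formulas above.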
Word Embeddings
import gensim.downloader as api

# Load pre-trained model.
# NOTE: the first call downloads and caches the vectors; this model is a
# large download (on the order of gigabytes), so expect it to take a while.
word_vectors = api.load("word2vec-google-news-300")

# Get vector for a word
vector = word_vectors["king"]
print(f"Vector dimension: {vector.shape}")

# Find similar words: most_similar returns (word, cosine similarity) pairs.
similar_words = word_vectors.most_similar("king", topn=5)
for word, score in similar_words:
    print(f"{word}: {score:.4f}")

# Word analogy: king - man + woman = ?
result = word_vectors.most_similar(positive=["woman", "king"], negative=["man"])
print(result[0])
from gensim.models import Word2Vec

# Tokenized sentences: each training example is a list of string tokens.
sentences = [
    ["this", "is", "sentence", "one"],
    ["this", "is", "sentence", "two"],
]

# Train model (min_count=1 keeps every word of this toy corpus).
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Save and load model: round-trip through disk.
model_path = "word2vec.model"
model.save(model_path)
model = Word2Vec.load(model_path)
NLP Models
Traditional ML Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Prepare data
y = [0, 1, 0, 1]  # Example labels

# Split the raw documents first, so that the vectorizer below never sees the
# test documents (fitting TF-IDF on the full corpus leaks test-set vocabulary
# and document frequencies into training).
docs_train, docs_test, y_train, y_test = train_test_split(
    documents, y, test_size=0.2, random_state=42
)

# Learn vocabulary and IDF weights on the training split only, then reuse
# the fitted vectorizer to transform the held-out documents.
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)

# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
# zero_division=0: with a toy test set a class may have no predicted samples,
# which would otherwise emit an undefined-metric warning.
print(classification_report(y_test, y_pred, zero_division=0))
Naive Bayes - Fast, works well with high-dimensional data
Logistic Regression - Good baseline, interpretable
Support Vector Machines - Effective for text classification
Random Forests - Robust, handles non-linear relationships
Deep Learning Models
import torch
import torch.nn as nn
import torch.optim as optim
class TextLSTM(nn.Module):
    """LSTM text classifier: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output units (e.g. classes).
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between LSTM layers and to the
            final hidden state before the linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only *between* layers; with a single layer
        # a non-zero value has no effect and triggers a warning, so pass 0.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            batch_first=True,
                            dropout=dropout if n_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return unnormalized scores of shape (batch, output_dim).

        Args:
            text: Integer token-id tensor of shape (batch, seq_len); the
                batch dimension comes first because the LSTM is built with
                ``batch_first=True``.
        """
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        # hidden[-1] is the final hidden state of the last LSTM layer.
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)
# Model parameters (example values for a binary classifier)
VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM = 10000, 100, 256
OUTPUT_DIM, N_LAYERS, DROPOUT = 2, 2, 0.5

# Build the network; keyword arguments make the positional order explicit.
model = TextLSTM(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
)
Transformers
BERT Implementation
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch

# Load pre-trained BERT model and tokenizer.
# NOTE: the encoder weights are pre-trained, but the 2-label classification
# head is newly initialized, so the predicted class is not meaningful until
# the model has been fine-tuned.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize text
text = "This is a sample text for classification."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Get model outputs. This is inference only, so disable gradient tracking to
# avoid building an autograd graph (saves memory and time).
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Get predictions
predictions = torch.argmax(logits, dim=-1)
print(f"Predicted class: {predictions.item()}")
from transformers import Trainer, TrainingArguments

# NOTE(review): `model`, `train_dataset` and `eval_dataset` are not defined in
# this snippet; they must already exist (a model to fine-tune and two
# tokenized datasets) before this code runs.

# Define training arguments
training_config = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)
training_args = TrainingArguments(**training_config)

# Create trainer
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset, eval_dataset=eval_dataset)

# Fine-tune model
trainer.train()
GPT & Other Models
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Encode the prompt. Calling the tokenizer (instead of `encode`) also returns
# the attention mask, which `generate` needs to tell real tokens from padding;
# GPT-2 has no pad token, so the mask cannot be reliably inferred.
input_text = "The future of artificial intelligence"
inputs = tokenizer(input_text, return_tensors='pt')
input_ids = inputs['input_ids']

# Generate text with various parameters
output = model.generate(
    input_ids,
    attention_mask=inputs['attention_mask'],
    max_length=100,            # total length, prompt tokens included
    num_return_sequences=1,
    temperature=0.7,           # only used because do_sample=True
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,  # reuse EOS as the padding id
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
BERT - Bidirectional Encoder Representations (Google)
GPT - Generative Pre-trained Transformer (OpenAI)
RoBERTa - Robustly Optimized BERT (Facebook)
DistilBERT - Smaller, faster version of BERT
T5 - Text-to-Text Transfer Transformer (Google)
Advanced NLP Tasks
Named Entity Recognition
import spacy

# Load English model (must be installed first:
# `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

# Process text
text = "Apple is looking at buying U.K. startup for $1 billion. Tim Cook is the CEO."
doc = nlp(text)

# Extract entities: span text, label code, and a human-readable label gloss.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))
# Output:
# Apple ORG Companies, agencies, institutions, etc.
# U.K. GPE Countries, cities, states
# $1 billion MONEY Monetary values, including unit
# Tim Cook PERSON People, including fictional
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load NER pipeline
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Perform NER: each result is a dict describing one tagged token.
results = ner_pipeline("Hugging Face is a company based in New York City.")
for entity in results:
    print(f"{entity['word']}: {entity['entity']} (confidence: {entity['score']:.4f})")
Sentiment Analysis
from textblob import TextBlob

# Analyze sentiment
text = "I love this product! It's amazing."
blob = TextBlob(text)

# Get sentiment polarity and subjectivity
polarity = blob.sentiment.polarity
subjectivity = blob.sentiment.subjectivity
print(f"Polarity: {polarity:.4f}")  # Range: -1 (negative) to 1 (positive)
print(f"Subjectivity: {subjectivity:.4f}")  # Range: 0 (objective) to 1 (subjective)

# Map the signed polarity score to a coarse label; exactly 0 is neutral.
if polarity > 0:
    print("Positive sentiment")
elif polarity < 0:
    print("Negative sentiment")
else:
    print("Neutral sentiment")
from transformers import pipeline

# Create sentiment analysis pipeline (no model given, so the library's
# default checkpoint for this task is used).
classifier = pipeline("sentiment-analysis")

# Analyze multiple texts
texts = [
    "I love this product! It's amazing.",
    "This is the worst thing I've ever bought.",
    "It's okay, nothing special.",
]
results = classifier(texts)

# Results come back in input order, one dict per text.
for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"Sentiment: {result['label']}, Score: {result['score']:.4f}\n")
Additional Resources
Learning Resources
- Books: "Speech and Language Processing" by Jurafsky & Martin
- Courses: Stanford CS224N, Fast.ai NLP, Coursera NLP Specialization
- Documentation: Hugging Face, spaCy, NLTK, Gensim
- Research Papers: BERT, GPT, and the original Transformer paper ("Attention Is All You Need")
- Communities: Hugging Face, Papers with Code, Reddit ML/NLP
Useful Libraries
- Core NLP: NLTK, spaCy, Stanford CoreNLP
- Deep Learning: PyTorch, TensorFlow, Keras
- Transformers: Hugging Face Transformers
- Word Embeddings: Gensim, FastText
- Utilities: TextBlob, Pattern, Scikit-learn
Comprehensive NLP Concepts & Cheatsheet Reference
This NLP concepts cheatsheet on Nikhil Learn Hub collects syntax, commands, and practical snippets for quick revision. Explore NLP concepts, text processing, tokenization, embeddings, and machine learning techniques with simple examples.
Use the reference cards and examples above during coding sessions; return here instead of scattered searches when you need dependable reminders. Follow the Natural language processing learning roadmap when you want structured lessons beyond one-page lookups.
Quick lookup coverage
- Syntax, commands, and API signatures
- Copy-ready examples and common patterns
- Terminology for coursework and interviews
- Cross-links to the matching learning roadmap
How to study with this sheet
- Production debugging and tuning reminders
- Security, performance, or scale cautions
- Integration with adjacent stacks on this site
- Deeper study through tutorials and roadmaps
Who Should Use This Cheatsheet
Students, self-taught developers, and professionals who need fast NLP concept lookups during labs, debugging, or interview revision should keep this page bookmarked.
Related Resources on Nikhil Learn Hub
- Natural language processing learning roadmap: structured learning path for the same technology
- Cheatsheets hub: browse all quick-reference sheets
- Technology hub: tutorials, roadmaps, and practice hubs