Text Preprocessing

Basic Text Cleaning

# Import libraries
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download required NLTK data: 'punkt' (sentence/word tokenizer models),
# 'stopwords' (per-language stopword lists), 'wordnet' (lemmatizer lexicon).
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab'
# for word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Basic text cleaning function
def clean_text(text):
    """Normalize raw text for downstream NLP processing.

    Lowercases the input, strips ASCII punctuation and digit runs, and
    collapses all whitespace to single spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercase, single-spaced string.
    """
    lowered = text.lower()
    # One C-level translation pass removes every char in string.punctuation.
    no_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    # Drop digit runs entirely (note: punctuation was already removed, so
    # "3.14" has become "314" by this point and is deleted whole).
    no_digits = re.sub(r'\d+', '', no_punct)
    # split() with no args splits on any whitespace and discards empties,
    # so joining with single spaces also trims leading/trailing blanks.
    return ' '.join(no_digits.split())

Tokenization & Normalization

# Tokenization
text = "Natural Language Processing is amazing!"

# Word tokenization: splits into word and punctuation tokens (note the
# trailing '!' is kept as its own token).
tokens = word_tokenize(text)
print(tokens)
# Output: ['Natural', 'Language', 'Processing', 'is', 'amazing', '!']

# Sentence tokenization: splits a passage into individual sentences.
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize("This is sentence one. This is sentence two.")
print(sentences)

# Stopwords removal: drop high-frequency function words ('is', 'the', ...).
# Tokens are lowercased only for the comparison, so surviving tokens keep
# their original casing.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
# Stemming and Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ["running", "ran", "runs", "easily", "fairly"]

# Stemming: rule-based suffix stripping — fast, but can produce non-words
# ('easili', 'fairli') and misses irregular forms ('ran' stays 'ran').
stems = [stemmer.stem(word) for word in words]
print(stems)
# Output: ['run', 'ran', 'run', 'easili', 'fairli']

# Lemmatization: dictionary-based reduction to a valid base form. pos='v'
# treats every word as a verb, which is why irregular 'ran' maps to 'run'.
lemmas = [lemmatizer.lemmatize(word, pos='v') for word in words]
print(lemmas)
# Output: ['run', 'run', 'run', 'easily', 'fairly']

Feature Extraction

Bag of Words & TF-IDF

# Using scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Bag of Words: fit learns the vocabulary; each row of X is a raw term-count
# vector over that vocabulary (stored as a sparse matrix).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
print(vectorizer.get_feature_names_out())
print(X.toarray())

# TF-IDF: counts re-weighted so terms that appear in every document score low.
# NOTE(review): sklearn applies smoothed IDF and L2 row normalization by
# default, so these values differ slightly from the textbook TF-IDF formula.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)
print(X_tfidf.toarray())
TF-IDF Formula

TF(t,d) = (Number of times term t appears in document d) / (Total number of terms in document d)

IDF(t) = log(Total number of documents / Number of documents containing term t)

(Note: scikit-learn's TfidfVectorizer, used above, computes a smoothed variant — log((1 + N) / (1 + df)) + 1 — and L2-normalizes each document vector by default, so its printed values differ slightly from this textbook formula.)

TF-IDF(t,d) = TF(t,d) × IDF(t)

Word Embeddings

# Using pre-trained Word2Vec
import gensim.downloader as api

# Load pre-trained model (downloaded and cached on first use; the file is
# very large — this call can take a long time).
word_vectors = api.load("word2vec-google-news-300")

# Get vector for a word: a 300-dimensional dense embedding (per model name).
vector = word_vectors["king"]
print(f"Vector dimension: {vector.shape}")

# Find similar words: nearest neighbours by cosine similarity.
similar_words = word_vectors.most_similar("king", topn=5)
for word, score in similar_words:
    print(f"{word}: {score:.4f}")

# Word analogy: king - man + woman = ?  (classically resolves to 'queen')
result = word_vectors.most_similar(positive=["woman", "king"], negative=["man"])
print(result[0])

# Training custom Word2Vec model
from gensim.models import Word2Vec

# Tokenized sentences: the training corpus is a list of token lists.
sentences = [["this", "is", "sentence", "one"],
             ["this", "is", "sentence", "two"]]

# Train model: 100-dim vectors, context window of 5 tokens, min_count=1 keeps
# every token (needed for such a tiny corpus), 4 worker threads.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save and load model
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")

NLP Models

Traditional ML Models

# Text classification with Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Prepare data: TF-IDF features capped at the 1000 most frequent terms.
# NOTE(review): `documents` must already be defined (see the feature-
# extraction section); with only 4 samples the 20% split below leaves a
# single test example, so the metrics are illustrative only.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(documents)
y = [0, 1, 0, 1] # Example labels

# Split data: 80% train / 20% test, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model: multinomial NB suits non-negative count/TF-IDF features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))
Common ML Models for NLP

Naive Bayes - Fast, works well with high-dimensional data

Logistic Regression - Good baseline, interpretable

Support Vector Machines - Effective for text classification

Random Forests - Robust, handles non-linear relationships

Deep Learning Models

# Simple LSTM for text classification
import torch
import torch.nn as nn
import torch.optim as optim

class TextLSTM(nn.Module):
    """LSTM text classifier: embedding -> stacked LSTM -> linear head.

    The final hidden state of the top LSTM layer is dropout-regularized
    and projected to ``output_dim`` class logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        # Maps integer token ids to dense embedding_dim-sized vectors.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: tensors are (batch, seq, feature). The LSTM's
        # own dropout is applied between layers, so it only has an effect
        # when n_layers > 1.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                           batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map (batch, seq) token ids to (batch, output_dim) logits."""
        _, (final_hidden, _cell) = self.lstm(self.embedding(text))
        # final_hidden has shape (n_layers, batch, hidden_dim); index -1
        # selects the top layer's last hidden state for each sequence.
        regularized = self.dropout(final_hidden[-1])
        return self.fc(regularized)

# Model parameters for the TextLSTM class defined above.
VOCAB_SIZE = 10000  # number of distinct token ids (embedding rows)
EMBEDDING_DIM = 100  # size of each token embedding vector
HIDDEN_DIM = 256  # LSTM hidden-state width
OUTPUT_DIM = 2  # number of output classes (binary classification)
N_LAYERS = 2  # stacked LSTM layers (inter-layer dropout applies since > 1)
DROPOUT = 0.5  # dropout probability for both the LSTM and the head

# Instantiate the (untrained) classifier.
model = TextLSTM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)

Transformers

BERT Implementation

# Using Hugging Face Transformers
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch

# Load pre-trained BERT model and tokenizer. num_labels=2 attaches a fresh,
# randomly initialized binary classification head on top of the encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize text into a batch of PyTorch tensors, truncating at max_length=512.
text = "This is a sample text for classification."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Get model outputs. NOTE(review): the classification head is untrained at
# this point, so the prediction below is effectively random.
outputs = model(**inputs)
logits = outputs.logits

# Get predictions: index of the highest logit along the class dimension.
predictions = torch.argmax(logits, dim=-1)
print(f"Predicted class: {predictions.item()}")
# Fine-tuning BERT for custom task
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',  # where checkpoints and final weights are saved
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,  # linear learning-rate warm-up steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Create trainer.
# NOTE(review): `train_dataset` and `eval_dataset` are not defined anywhere
# in this snippet — the caller must supply tokenized, labelled datasets
# before this code will run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune model
trainer.train()

GPT & Other Models

# Using GPT-2 for text generation
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Generate text
input_text = "The future of artificial intelligence"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate text with various parameters
output = model.generate(
    input_ids,
    max_length=100,  # total length cap, prompt tokens included
    num_return_sequences=1,
    temperature=0.7,  # < 1.0 sharpens the sampling distribution
    do_sample=True,  # sample instead of greedy decoding (required for temperature to matter)
    pad_token_id=tokenizer.eos_token_id  # GPT-2 has no pad token; reuse EOS
)

# Decode the first (only) generated sequence back into a string.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
Popular Transformer Models

BERT - Bidirectional Encoder Representations (Google)

GPT - Generative Pre-trained Transformer (OpenAI)

RoBERTa - Robustly Optimized BERT (Facebook)

DistilBERT - Smaller, faster version of BERT

T5 - Text-to-Text Transfer Transformer (Google)

Advanced NLP Tasks

Named Entity Recognition

# Using spaCy for NER
import spacy

# Load English model. NOTE(review): requires a prior
# `python -m spacy download en_core_web_sm` or spacy.load raises OSError.
nlp = spacy.load("en_core_web_sm")

# Process text through the full pipeline (tokenizer, tagger, parser, NER).
text = "Apple is looking at buying U.K. startup for $1 billion. Tim Cook is the CEO."
doc = nlp(text)

# Extract entities: each ent carries its surface text and a label;
# spacy.explain turns the label code into a human-readable description.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))

# Output:
# Apple ORG Companies, agencies, institutions, etc.
# U.K. GPE Countries, cities, states
# $1 billion MONEY Monetary values, including unit
# Tim Cook PERSON People, including fictional
# Using Transformers for NER
# NOTE(review): AutoTokenizer and AutoModelForTokenClassification are
# imported but never used below — the pipeline loads both internally.
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load NER pipeline backed by a cased BERT-large fine-tuned on CoNLL-03.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Perform NER: each result dict carries the token ('word'), its entity tag
# ('entity'), and a confidence score.
results = ner_pipeline("Hugging Face is a company based in New York City.")
for entity in results:
    print(f"{entity['word']}: {entity['entity']} (confidence: {entity['score']:.4f})")

Sentiment Analysis

# Using TextBlob for simple sentiment analysis
from textblob import TextBlob

# Analyze sentiment
text = "I love this product! It's amazing."
blob = TextBlob(text)

# Get sentiment polarity and subjectivity (lexicon-based scores).
polarity = blob.sentiment.polarity
subjectivity = blob.sentiment.subjectivity

print(f"Polarity: {polarity:.4f}") # Range: -1 (negative) to 1 (positive)
print(f"Subjectivity: {subjectivity:.4f}") # Range: 0 (objective) to 1 (subjective)

# Classify by the sign of the polarity score; exactly 0.0 counts as neutral.
if polarity > 0:
    print("Positive sentiment")
elif polarity < 0:
    print("Negative sentiment")
else:
    print("Neutral sentiment")
# Using Transformers for advanced sentiment analysis
from transformers import pipeline

# Create sentiment analysis pipeline (downloads a default English sentiment
# model on first use when no model name is given).
classifier = pipeline("sentiment-analysis")

# Analyze multiple texts in one batched call.
texts = [
    "I love this product! It's amazing.",
    "This is the worst thing I've ever bought.",
    "It's okay, nothing special."
]

# Results come back in the same order as the inputs: one dict per text
# with a 'label' (e.g. POSITIVE/NEGATIVE) and a confidence 'score'.
results = classifier(texts)
for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"Sentiment: {result['label']}, Score: {result['score']:.4f}\n")

Additional Resources

Learning Resources

  • Books: "Speech and Language Processing" by Jurafsky & Martin
  • Courses: Stanford CS224N, Fast.ai NLP, Coursera NLP Specialization
  • Documentation: Hugging Face, spaCy, NLTK, Gensim
  • Research Papers: BERT, GPT, Transformer, Attention Is All You Need
  • Communities: Hugging Face, Papers with Code, Reddit ML/NLP

Useful Libraries

  • Core NLP: NLTK, spaCy, Stanford CoreNLP
  • Deep Learning: PyTorch, TensorFlow, Keras
  • Transformers: Hugging Face Transformers
  • Word Embeddings: Gensim, FastText
  • Utilities: TextBlob, Pattern, Scikit-learn