Python Basics
Data Structures
# Lists
my_list = [1, 2, 3, 'apple', 'banana']
my_list.append('orange') # Add element to the end of the list
my_list[0] # Access first element (indexing starts at 0)
# Dictionaries
my_dict = {'name': 'John', 'age': 30, 'city': 'New York'}
my_dict['age'] # Access the value stored under the key 'age'
my_dict['occupation'] = 'Engineer' # Add new key-value pair
# Tuples (immutable)
my_tuple = (1, 2, 3, 'apple')
# Sets (unique elements)
my_set = {1, 2, 3, 3, 4} # Results in {1, 2, 3, 4}; the duplicate 3 is discarded
# NumPy Arrays
import numpy as np
arr = np.array([1, 2, 3, 4, 5])
arr.shape # Get dimensions; (5,) for this one-dimensional array
Control Flow
# Conditional branching: exactly one of the three branches runs.
x = 10
if x > 10:
    print("Greater than 10")
elif x == 10:
    print("Equal to 10")
else:
    print("Less than 10")

# For loops
for i in range(5):
    print(i)

# List comprehension
# The comprehension's `x` is local to the comprehension; the outer `x` is
# left unchanged.
squares = [x**2 for x in range(10)]

# While loop
count = 0
while count < 5:
    print(count)
    count += 1  # Advance the counter so the loop terminates
# Functions
def add_numbers(a, b):
    """Return ``a + b``.

    Works for any pair of operands that support the ``+`` operator.
    """
    return a + b


result = add_numbers(5, 3)
Pandas Data Manipulation
DataFrames Basics
# pandas must be imported before `pd` is used below.
import pandas as pd

# Creating DataFrame
df = pd.DataFrame({
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 24, 35, 32],
    'City': ['New York', 'Paris', 'Berlin', 'London'],
})

# Reading data
# Each assignment rebinds `df`; in practice pick the one that matches your source.
df = pd.read_csv('data.csv') # From CSV
df = pd.read_excel('data.xlsx') # From Excel

# Basic operations
df.head() # First 5 rows
df.info() # DataFrame info
df.describe() # Statistical summary
df.shape # (rows, columns)
df.columns # Column names
Data Selection
# Selecting columns
df['Name'] # Single column
df[['Name', 'Age']] # Multiple columns (note the inner list)
# Selecting rows
df.iloc[0] # First row by integer position
df.loc[0] # Row whose index label is 0
df[df['Age'] > 30] # Filter rows where Age is greater than 30
# Boolean indexing: combine conditions with & and parenthesize each one
df[(df['Age'] > 30) & (df['City'] == 'London')]
# Setting values
df.loc[0, 'Age'] = 29 # Set the 'Age' value in the row labelled 0
# Adding new column
df['Senior'] = df['Age'] > 30 # One True/False flag per row
# Dropping columns
df.drop('City', axis=1, inplace=True) # axis=1 targets columns; inplace=True modifies df itself
Data Cleaning
# Handling missing values
# The calls in this group are written without inplace=True and their results
# are not assigned, so `df` itself is left as it was.
df.isnull().sum() # Count missing values per column
df.dropna() # Drop rows with missing values
df.fillna(0) # Fill missing values with 0
# Restrict the mean to numeric columns: averaging text columns such as
# 'Name' raises a TypeError on recent pandas versions.
df.fillna(df.mean(numeric_only=True)) # Fill numeric columns with their mean

# Removing duplicates
df.drop_duplicates()

# Data type conversion
df['Age'] = df['Age'].astype('int')

# String operations
df['Name'] = df['Name'].str.upper()
df['Name'] = df['Name'].str.replace(' ', '_')

# Renaming columns
df.rename(columns={'Name': 'Full_Name'}, inplace=True)

# Resetting index
df.reset_index(drop=True, inplace=True) # drop=True discards the old index
Grouping & Aggregation
# Group by
# NOTE(review): the examples below assume df has 'City', 'Age', 'Senior' and
# 'Salary' columns; 'Salary' does not appear in the example DataFrame shown
# in this sheet — confirm against your own data.
grouped = df.groupby('City')
grouped['Age'].mean() # Mean age by city
# Multiple aggregations: map each column to one function name or a list of them
df.groupby('City').agg({
'Age': ['mean', 'min', 'max', 'count'],
'Salary': 'sum'
})
# Pivot tables: rows are cities, columns are Senior values, cells are mean Age
pd.pivot_table(df, values='Age', index='City',
columns='Senior', aggfunc='mean')
# Sorting values
df.sort_values('Age', ascending=False) # Largest Age first
# Value counts
df['City'].value_counts() # Number of rows per distinct city
# Applying functions
df['Age'].apply(lambda x: x * 12) # Convert to months
Data Visualization
Matplotlib
# pyplot must be imported before `plt` is used below.
import matplotlib.pyplot as plt

# NOTE: x, y, data, categories and values are placeholders for your own
# sequences; this snippet does not define them.

# Line plot
plt.plot(x, y)
plt.title('Line Plot')
plt.xlabel('X Axis')
plt.ylabel('Y Axis')
plt.show()

# Scatter plot
plt.scatter(x, y)
plt.title('Scatter Plot')
plt.show()

# Histogram
plt.hist(data, bins=30)
plt.title('Histogram')
plt.show()

# Bar chart
plt.bar(categories, values)
plt.title('Bar Chart')
plt.show()

# Subplots: a 2x2 grid of axes, addressed as axes[row, column]
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes[0, 0].plot(x, y)
axes[0, 1].scatter(x, y)
axes[1, 0].hist(data)
axes[1, 1].bar(categories, values)
plt.tight_layout()
plt.show()
Seaborn
# seaborn and pyplot must be imported before `sns` / `plt` are used below.
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style('whitegrid')

# Distribution plot
sns.histplot(data=df, x='Age', kde=True)
plt.show()

# Box plot
sns.boxplot(data=df, x='City', y='Age')
plt.show()

# Violin plot
sns.violinplot(data=df, x='City', y='Age')
plt.show()

# Count plot
sns.countplot(data=df, x='City')
plt.show()

# Heatmap
# Correlate numeric columns only: text columns such as 'City' make
# DataFrame.corr() raise on recent pandas versions.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

# Pair plot
sns.pairplot(df, hue='City')
plt.show()
Machine Learning
Scikit-learn Basics
# train_test_split is used below and must be imported with the rest.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Prepare data
# Assumes df has a column named 'target' holding the value to predict.
X = df.drop('target', axis=1)
y = df['target']

# Split data: hold out 20% for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features
# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict
y_pred = model.predict(X_test_scaled)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
Classification
# LogisticRegression is used below and must be imported with the rest.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{name} Results:')
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    # Real newlines around the separator (the escaped '\\n' printed a
    # literal backslash-n instead).
    print('\n' + '='*50 + '\n')

# Cross-validation
# `model` here is the last estimator bound by the loop above (the 'SVM' entry).
scores = cross_val_score(model, X, y, cv=5)
print(f'Cross-validation scores: {scores}')
print(f'Average score: {scores.mean()}')
Model Evaluation
# pyplot is needed for the ROC curve plot below.
import matplotlib.pyplot as plt
# Restored opening of the parenthesized import; without it the name list
# and closing parenthesis below are a syntax error.
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve
)

# For classification models
# y_test / y_pred are the true and predicted labels from a fitted classifier.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'ROC AUC: {roc_auc:.2f}')

# ROC Curve
# NOTE(review): y_pred_prob is not defined in this snippet; presumably it is
# the positive-class probability, e.g. model.predict_proba(X_test)[:, 1] —
# confirm before use.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# For regression models
# Here y_test / y_pred must come from a regression model, not the
# classifier evaluated above.
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score
)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # Root of the MSE, in the same units as the target
r2 = r2_score(y_test, y_pred)
Hyperparameter Tuning
# Both search classes are used below and must be imported first.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
}

# Initialize model
model = RandomForestClassifier()

# Grid search: tries every combination in param_grid (3 * 3 * 3 = 27 here)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(X_train, y_train)

# Best parameters and score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

# Randomized search (faster for large parameter spaces)
# Samples n_iter=10 combinations instead of trying all of them.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42
)
random_search.fit(X_train, y_train)
Statistics
Descriptive Statistics
# NOTE: data, x and y are placeholders for your own numeric sequences;
# this snippet does not define them.
import scipy.stats as stats
# Measures of central tendency
mean = np.mean(data)
median = np.median(data)
mode = stats.mode(data)
# Measures of dispersion
# NOTE(review): np.var / np.std are called without ddof here — confirm whether
# population or sample statistics are intended.
variance = np.var(data)
std_dev = np.std(data)
range_val = np.ptp(data) # Peak to peak (max - min)
iqr = stats.iqr(data) # Interquartile range
# Percentiles and quantiles
q1 = np.percentile(data, 25) # First quartile
q3 = np.percentile(data, 75) # Third quartile
# Shape of distribution
skewness = stats.skew(data)
kurtosis = stats.kurtosis(data)
# Correlation
correlation = np.corrcoef(x, y)[0, 1] # [0, 1] picks the x-y entry of the 2x2 matrix
# Covariance
covariance = np.cov(x, y)[0, 1] # [0, 1] picks the x-y entry of the 2x2 matrix
Inferential Statistics
# NOTE: group1, group2, group3, contingency_table, std_err, x and data are
# placeholders that this snippet does not define; mean and std_dev are the
# names assigned in the descriptive-statistics snippet.
# T-test (compare means)
t_stat, p_value = stats.ttest_ind(group1, group2)
# Chi-square test (categorical data)
from scipy.stats import chi2_contingency
chi2, p, dof, expected = chi2_contingency(contingency_table)
# ANOVA (compare means across multiple groups)
f_stat, p_value = stats.f_oneway(group1, group2, group3) # Rebinds p_value from the t-test above
# Confidence intervals
ci = stats.norm.interval(0.95, loc=mean, scale=std_err) # 0.95 is the confidence level
# Normal distribution
z_score = (x - mean) / std_dev
# Probability density function
pdf = stats.norm.pdf(x, mean, std_dev)
# Cumulative distribution function
cdf = stats.norm.cdf(x, mean, std_dev)
# Sampling
sample = np.random.choice(data, size=100, replace=False) # 100 draws without replacement
# Bootstrap sampling: 1000 resamples, each the same size as data.
# Unlike the line above, replace= is not passed here, so NumPy's default
# (sampling with replacement) applies.
bootstrap_means = [np.mean(np.random.choice(data, size=len(data)))
for _ in range(1000)]
Quick Reference
Common Libraries
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Machine learning
import sklearn
import tensorflow as tf
import torch
import xgboost as xgb

# Statistics
import scipy.stats as stats
import statsmodels.api as sm

# Natural language processing
import nltk
import spacy
import gensim

# Web scraping
import requests
# Beautiful Soup is installed as "beautifulsoup4" but imported as "bs4";
# "import beautifulsoup4" fails with ModuleNotFoundError.
import bs4
import scrapy

# Database access
import sqlite3
import pymysql
import psycopg2
import sqlalchemy
Useful Resources
- Online Courses: Coursera, edX, Udacity, DataCamp
- Books: "Python for Data Analysis", "Introduction to Statistical Learning"
- YouTube Channels: StatQuest, Corey Schafer, Krish Naik
- Communities: Stack Overflow, Kaggle, Reddit (r/datascience)
- Practice Platforms: Kaggle, HackerRank, LeetCode
- Blogs: Towards Data Science, KDnuggets, Analytics Vidhya
- Documentation: Official docs for pandas, NumPy, scikit-learn
Comprehensive Data Science Concepts Cheatsheet Reference
This Data Science Concepts cheatsheet on Nikhil Learn Hub collects syntax, commands, and practical snippets for quick revision. Explore data science workflows, statistics, visualization, machine learning, and data analysis techniques with examples.
Use the reference cards and examples above during coding sessions; return here instead of scattered searches when you need dependable reminders. Follow the Data science learning roadmap when you want structured lessons beyond one-page lookups.
Quick lookup coverage
- Syntax, commands, and API signatures
- Copy-ready examples and common patterns
- Terminology for coursework and interviews
- Cross-links to the matching learning roadmap
How to study with this sheet
- Production debugging and tuning reminders
- Security, performance, or scale cautions
- Integration with adjacent stacks on this site
- Deeper study through tutorials and roadmaps
Who Should Use This Cheatsheet
Students, self-taught developers, and professionals who need fast Data Science Concepts lookups during labs, debugging, or interview revision should keep this page bookmarked.
Related Resources on Nikhil Learn Hub
- Data science learning roadmap — structured learning path for the same technology
- Cheatsheets hub — browse all quick-reference sheets
- Technology hub — tutorials, roadmaps, and practice hubs