Data Science Concepts Cheatsheet

Python Basics

Data Structures

# Lists

my_list = [1, 2, 3, 'apple', 'banana']

my_list.append('orange')  # Add element

my_list[0]  # Access first element

# Dictionaries

my_dict = {'name': 'John', 'age': 30, 'city': 'New York'}

my_dict['age']  # Access value

my_dict['occupation'] = 'Engineer'  # Add new key-value pair

# Tuples (immutable)

my_tuple = (1, 2, 3, 'apple')

# Sets (unique elements)

my_set = {1, 2, 3, 3, 4}  # Results in {1, 2, 3, 4}

# NumPy Arrays

import numpy as np

arr = np.array([1, 2, 3, 4, 5])

arr.shape  # Get dimensions

Control Flow

# If-elif-else statements

x = 10

if x > 10:

    print("Greater than 10")

elif x == 10:

    print("Equal to 10")

else:

    print("Less than 10")

# For loops

for i in range(5):

    print(i)

# List comprehension

squares = [x**2 for x in range(10)]

# While loop

count = 0

while count < 5:

    print(count)

    count += 1

# Functions

def add_numbers(a, b):
    """Return the result of ``a + b``.

    Works for any pair of operands that support the ``+`` operator
    (numbers, strings, lists, ...).
    """
    total = a + b
    return total

result = add_numbers(5, 3)

Pandas Data Manipulation

DataFrames Basics

import pandas as pd

# Creating DataFrame

df = pd.DataFrame({

    'Name': ['John', 'Anna', 'Peter', 'Linda'],

    'Age': [28, 24, 35, 32],

    'City': ['New York', 'Paris', 'Berlin', 'London']

})

# Reading data

df = pd.read_csv('data.csv')  # From CSV

df = pd.read_excel('data.xlsx')  # From Excel

# Basic operations

df.head()  # First 5 rows

df.info()  # DataFrame info

df.describe()  # Statistical summary

df.shape  # (rows, columns)

df.columns  # Column names

Data Selection

# Selecting columns

df['Name']  # Single column

df[['Name', 'Age']]  # Multiple columns

# Selecting rows

df.iloc[0]  # First row by integer position

df.loc[0]  # Row whose index label is 0 (the first row only with the default RangeIndex)

df[df['Age'] > 30]  # Keep only rows where Age > 30 (boolean mask)

# Boolean indexing

df[(df['Age'] > 30) & (df['City'] == 'London')]

# Setting values

df.loc[0, 'Age'] = 29  # Set specific value

# Adding new column

df['Senior'] = df['Age'] > 30

# Dropping columns

df.drop('City', axis=1, inplace=True)

Data Cleaning

# Handling missing values
# None of these calls modify df in place; each returns a new object.
df.isnull().sum()  # Count missing values per column
df.dropna()  # Drop rows with missing values
df.fillna(0)  # Fill missing values with 0
# numeric_only=True skips text columns such as 'Name'/'City'; without it
# pandas >= 2.0 raises TypeError when it tries to average strings.
df.fillna(df.mean(numeric_only=True))  # Fill numeric columns with their mean

# Removing duplicates

df.drop_duplicates()

# Data type conversion

df['Age'] = df['Age'].astype('int')

# String operations

df['Name'] = df['Name'].str.upper()

df['Name'] = df['Name'].str.replace(' ', '_')

# Renaming columns

df.rename(columns={'Name': 'Full_Name'}, inplace=True)

# Resetting index

df.reset_index(drop=True, inplace=True)

Grouping & Aggregation

# Group by operations

grouped = df.groupby('City')

grouped['Age'].mean()  # Mean age by city

# Multiple aggregations

df.groupby('City').agg({

    'Age': ['mean', 'min', 'max', 'count'],

    'Salary': 'sum'

})

# Pivot tables

pd.pivot_table(df, values='Age', index='City', 

    columns='Senior', aggfunc='mean')

# Sorting values

df.sort_values('Age', ascending=False)

# Value counts

df['City'].value_counts()

# Applying functions

df['Age'].apply(lambda x: x * 12)  # Convert to months

Data Visualization

Matplotlib

import matplotlib.pyplot as plt

# Line plot

plt.plot(x, y)

plt.title('Line Plot')

plt.xlabel('X Axis')

plt.ylabel('Y Axis')

plt.show()

# Scatter plot

plt.scatter(x, y)

plt.title('Scatter Plot')

plt.show()

# Histogram

plt.hist(data, bins=30)

plt.title('Histogram')

plt.show()

# Bar chart

plt.bar(categories, values)

plt.title('Bar Chart')

plt.show()

# Subplots

fig, axes = plt.subplots(2, 2, figsize=(10, 8))

axes[0, 0].plot(x, y)

axes[0, 1].scatter(x, y)

axes[1, 0].hist(data)

axes[1, 1].bar(categories, values)

plt.tight_layout()

plt.show()

Seaborn

import seaborn as sns

# Set style

sns.set_style('whitegrid')

# Distribution plot

sns.histplot(data=df, x='Age', kde=True)

plt.show()

# Box plot

sns.boxplot(data=df, x='City', y='Age')

plt.show()

# Violin plot

sns.violinplot(data=df, x='City', y='Age')

plt.show()

# Count plot

sns.countplot(data=df, x='City')

plt.show()

# Heatmap
# Correlation is only defined for numeric columns; numeric_only=True keeps
# pandas >= 2.0 from raising on text columns such as 'Name'/'City'.
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

# Pair plot

sns.pairplot(df, hue='City')

plt.show()

Machine Learning

Scikit-learn Basics

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

# Prepare data

X = df.drop('target', axis=1)

y = df['target']

# Split data

X_train, X_test, y_train, y_test = train_test_split(

    X, y, test_size=0.2, random_state=42

)

# Scale features

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

# Train model

model = LinearRegression()

model.fit(X_train_scaled, y_train)

# Predict

y_pred = model.predict(X_test_scaled)

# Evaluate

mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')

Classification

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix

# Initialize models

models = {

    'Logistic Regression': LogisticRegression(),

    'Decision Tree': DecisionTreeClassifier(),

    'Random Forest': RandomForestClassifier(),

    'SVM': SVC()

}

# Train and evaluate each model

# Fit every candidate classifier on the training split and report its
# performance on the held-out test split.
# Note: `model` and `y_pred` are rebound on each iteration, so after the loop
# they refer to the LAST entry in `models`.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{name} Results:')
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    # Real newlines around the separator rule (the original '\\n' printed a
    # literal backslash followed by 'n').
    print('\n' + '='*50 + '\n')

# Cross-validation

from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5)

print(f'Cross-validation scores: {scores}')

print(f'Average score: {scores.mean()}')

Model Evaluation

from sklearn.metrics import (

    accuracy_score, precision_score, recall_score, 

    f1_score, roc_auc_score, roc_curve

)

# For classification models
# Label-based metrics compare the predicted classes with the true classes.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC metrics need continuous scores rather than hard 0/1 labels: use the
# predicted probability of the positive class (column 1, binary problem).
# The classifier must expose predict_proba (e.g. LogisticRegression,
# RandomForestClassifier; SVC only when created with probability=True).
y_pred_prob = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_prob)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'ROC AUC: {roc_auc:.2f}')

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# For regression models

from sklearn.metrics import (

    mean_absolute_error, mean_squared_error, r2_score

)

mae = mean_absolute_error(y_test, y_pred)

mse = mean_squared_error(y_test, y_pred)

rmse = np.sqrt(mse)

r2 = r2_score(y_test, y_pred)

Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define parameter grid

param_grid = {

    'n_estimators': [100, 200, 300],

    'max_depth': [None, 5, 10],

    'min_samples_split': [2, 5, 10]

}

# Initialize model

model = RandomForestClassifier()

# Grid search

grid_search = GridSearchCV(

    estimator=model, 

    param_grid=param_grid, 

    cv=5, 

    scoring='accuracy'

)

grid_search.fit(X_train, y_train)

# Best parameters and score

print(f'Best parameters: {grid_search.best_params_}')

print(f'Best score: {grid_search.best_score_}')

# Randomized search (faster for large parameter spaces)

random_search = RandomizedSearchCV(

    estimator=model, 

    param_distributions=param_grid, 

    n_iter=10, 

    cv=5, 

    scoring='accuracy',

    random_state=42

)

random_search.fit(X_train, y_train)

Statistics

Descriptive Statistics

import numpy as np

import scipy.stats as stats

# Measures of central tendency

mean = np.mean(data)

median = np.median(data)

mode = stats.mode(data)

# Measures of dispersion

variance = np.var(data)  # Population variance (NumPy default ddof=0); pass ddof=1 for the sample variance

std_dev = np.std(data)  # Population standard deviation (ddof=0); pass ddof=1 for the sample estimate

range_val = np.ptp(data)  # Peak to peak (max - min)

iqr = stats.iqr(data)  # Interquartile range (75th percentile - 25th percentile)

# Percentiles and quantiles

q1 = np.percentile(data, 25)  # First quartile

q3 = np.percentile(data, 75)  # Third quartile

# Shape of distribution

skewness = stats.skew(data)

kurtosis = stats.kurtosis(data)

# Correlation

correlation = np.corrcoef(x, y)[0, 1]

# Covariance

covariance = np.cov(x, y)[0, 1]

Inferential Statistics

# Hypothesis testing

# T-test (compare means)

t_stat, p_value = stats.ttest_ind(group1, group2)

# Chi-square test (categorical data)

from scipy.stats import chi2_contingency

chi2, p, dof, expected = chi2_contingency(contingency_table)

# ANOVA (compare means across multiple groups)

f_stat, p_value = stats.f_oneway(group1, group2, group3)

# Confidence intervals

ci = stats.norm.interval(0.95, loc=mean, scale=std_err)

# Normal distribution

z_score = (x - mean) / std_dev

# Probability density function

pdf = stats.norm.pdf(x, mean, std_dev)

# Cumulative distribution function

cdf = stats.norm.cdf(x, mean, std_dev)

# Sampling

sample = np.random.choice(data, size=100, replace=False)

# Bootstrap sampling

bootstrap_means = [np.mean(np.random.choice(data, size=len(data))) 

    for _ in range(1000)]

Quick Reference

Common Libraries

# Data manipulation

import pandas as pd

import numpy as np

# Data visualization

import matplotlib.pyplot as plt

import seaborn as sns

import plotly.express as px

# Machine learning

import sklearn

import tensorflow as tf

import torch

import xgboost as xgb

# Statistics

import scipy.stats as stats

import statsmodels.api as sm

# Natural language processing

import nltk

import spacy

import gensim

# Web scraping
import requests
import bs4  # Beautiful Soup: installed as "beautifulsoup4", imported as "bs4"
import scrapy

# Database access

import sqlite3

import pymysql

import psycopg2

import sqlalchemy

Useful Resources

Online Courses: Coursera, edX, Udacity, DataCamp
Books: "Python for Data Analysis", "Introduction to Statistical Learning"
YouTube Channels: StatQuest, Corey Schafer, Krish Naik
Communities: Stack Overflow, Kaggle, Reddit (r/datascience)
Practice Platforms: Kaggle, HackerRank, LeetCode
Blogs: Towards Data Science, KDnuggets, Analytics Vidhya
Documentation: Official docs for pandas, NumPy, scikit-learn

Quick reference guide

Comprehensive Data Science Concepts Cheatsheet Reference

This Data Science Concepts cheatsheet on Nikhil Learn Hub collects syntax, commands, and practical snippets for quick revision. Explore data science workflows, statistics, visualization, machine learning, and data analysis techniques with examples.

Use the reference cards and examples above during coding sessions; return here instead of scattered searches when you need dependable reminders. Follow the Data science learning roadmap when you want structured lessons beyond one-page lookups.

Quick lookup coverage

Syntax, commands, and API signatures
Copy-ready examples and common patterns
Terminology for coursework and interviews
Cross-links to the matching learning roadmap

How to study with this sheet

Production debugging and tuning reminders
Security, performance, or scale cautions
Integration with adjacent stacks on this site
Deeper study through tutorials and roadmaps

Who Should Use This Cheatsheet

Students, self-taught developers, and professionals who need fast Data Science Concepts lookups during labs, debugging, or interview revision should keep this page bookmarked.

Related Resources on Nikhil Learn Hub

Data science learning roadmap: structured learning path for the same technology
Cheatsheets hub: browse all quick-reference sheets
Technology hub: tutorials, roadmaps, and practice hubs

Related Cheatsheet Links

Data Science Cheatsheet