Master data analysis, visualization, and machine learning algorithms to extract insights from data and build predictive models.
Transform raw data into actionable intelligence and build AI-powered solutions for real-world problems.
Prerequisites: Python basics, basic statistics and mathematics, curiosity about data-driven decision making
NumPy is the foundation of scientific computing in Python. Learn to work with multidimensional arrays efficiently.
import numpy as np
# Creating arrays
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.arange(0, 10, 2) # [0, 2, 4, 6, 8]
arr3 = np.linspace(0, 1, 5) # [0. , 0.25, 0.5 , 0.75, 1. ]
# 2D arrays (matrices)
matrix = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9]])
print(f"Shape: {matrix.shape}") # (3, 3)
print(f"Mean: {matrix.mean()}") # 5.0
print(f"Std: {matrix.std()}") # 2.581988897471611
# Array operations
arr = np.array([1, 2, 3, 4, 5])
print(arr * 2) # [2, 4, 6, 8, 10]
print(arr ** 2) # [1, 4, 9, 16, 25]
print(np.sqrt(arr)) # [1., 1.414, 1.732, 2., 2.236]
# Boolean indexing
arr = np.array([10, 15, 20, 25, 30])
mask = arr > 20
print(arr[mask]) # [25, 30]
# Statistical operations
data = np.random.randn(1000) # 1,000 draws from the standard normal distribution
print(f"Mean: {data.mean():.3f}")
print(f"Std: {data.std():.3f}")
print(f"Median: {np.median(data):.3f}")
print(f"25th percentile: {np.percentile(data, 25):.3f}")
print(f"75th percentile: {np.percentile(data, 75):.3f}")
Pandas provides powerful data structures for data analysis. Master DataFrames for handling structured data efficiently.
import pandas as pd
import numpy as np
# Creating a DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 28, 32],
    'Salary': [50000, 60000, 75000, 55000, 70000],
    'Department': ['IT', 'HR', 'IT', 'Sales', 'IT']
}
df = pd.DataFrame(data)
# Basic exploration
print(df.head())
print(df.info())
print(df.describe())
# Filtering data
it_employees = df[df['Department'] == 'IT']
high_earners = df[df['Salary'] > 60000]
# Groupby operations
avg_salary_by_dept = df.groupby('Department')['Salary'].mean()
# IT: 65000, HR: 60000, Sales: 55000
# Adding new columns
df['Annual_Bonus'] = df['Salary'] * 0.10
df['Experience_Level'] = df['Age'].apply(
    lambda x: 'Senior' if x > 30 else 'Junior'
)
# Handling missing data
df_with_nulls = df.copy()
df_with_nulls.loc[2, 'Salary'] = np.nan
# Fill missing values in the Salary column with its mean
df_filled = df_with_nulls.fillna({'Salary': df_with_nulls['Salary'].mean()})
# Drop missing values
df_dropped = df_with_nulls.dropna()
# Reading data from files
# df = pd.read_csv('data.csv')
# df = pd.read_excel('data.xlsx')
# df = pd.read_json('data.json')
# Merging DataFrames
df1 = pd.DataFrame({'ID': [1, 2, 3], 'Name': ['A', 'B', 'C']})
df2 = pd.DataFrame({'ID': [1, 2, 4], 'Score': [90, 85, 95]})
# Inner join
merged = pd.merge(df1, df2, on='ID', how='inner')
# Result: ID=[1,2], Name=['A','B'], Score=[90,85]
# Pivot tables
pivot = df.pivot_table(
    values='Salary',
    index='Department',
    aggfunc=['mean', 'count']
)
Visualize data insights with Matplotlib and Seaborn. Create compelling charts, plots, and dashboards.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Set style
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (12, 6)
# Sample data
np.random.seed(42)
df = pd.DataFrame({
    'Date': pd.date_range('2024-01-01', periods=100),
    'Sales': np.random.randn(100).cumsum() + 100,
    'Category': np.random.choice(['A', 'B', 'C'], 100),
    'Revenue': np.random.uniform(1000, 5000, 100)
})
# Line plot
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Sales'], color='#39ff14', linewidth=2)
plt.title('Sales Over Time', fontsize=16, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.grid(True, alpha=0.3)
plt.tight_layout()
# plt.savefig('sales_trend.png', dpi=300, bbox_inches='tight')
plt.show()
# Bar chart
category_sales = df.groupby('Category')['Revenue'].sum()
plt.figure(figsize=(10, 6))
category_sales.plot(kind='bar', color=['#39ff14', '#00f0ff', '#ff10f0'])
plt.title('Revenue by Category')
plt.xlabel('Category')
plt.ylabel('Total Revenue')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
# Scatter plot with regression line
plt.figure(figsize=(10, 6))
sns.regplot(x='Sales', y='Revenue', data=df, color='#39ff14')
plt.title('Sales vs Revenue Correlation')
plt.tight_layout()
plt.show()
# Distribution plot (histogram + KDE)
plt.figure(figsize=(10, 6))
sns.histplot(df['Revenue'], bins=30, kde=True, color='#00f0ff')
plt.title('Revenue Distribution')
plt.xlabel('Revenue')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='Category', y='Revenue', data=df, palette='Set2')
plt.title('Revenue Distribution by Category')
plt.tight_layout()
plt.show()
# Heatmap (correlation matrix)
corr_matrix = df[['Sales', 'Revenue']].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()
# Subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes[0, 0].plot(df['Date'], df['Sales'])
axes[0, 0].set_title('Sales Trend')
axes[0, 1].bar(range(len(category_sales)), category_sales.values)
axes[0, 1].set_title('Category Sales')
axes[1, 0].scatter(df['Sales'], df['Revenue'], alpha=0.5)
axes[1, 0].set_title('Sales vs Revenue')
axes[1, 1].hist(df['Revenue'], bins=20, color='#39ff14', alpha=0.7)
axes[1, 1].set_title('Revenue Distribution')
plt.tight_layout()
plt.show()
Build, train, and evaluate machine learning models for classification, regression, and clustering tasks.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
# Sample dataset (classification)
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Model 1: Logistic Regression
log_model = LogisticRegression(max_iter=200)
log_model.fit(X_train_scaled, y_train)
log_pred = log_model.predict(X_test_scaled)
log_accuracy = accuracy_score(y_test, log_pred)
print(f"Logistic Regression Accuracy: {log_accuracy:.3f}")
# Model 2: Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.3f}")
# Feature importance
feature_importance = pd.DataFrame({
    'feature': iris.feature_names,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)
# Confusion Matrix
cm = confusion_matrix(y_test, rf_pred)
print("\nConfusion Matrix:")
print(cm)
# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, rf_pred, target_names=iris.target_names))
# Cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=5)
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")
# Regression Example
from sklearn.datasets import make_regression
X_reg, y_reg = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train_reg, y_train_reg)
lr_pred = lr_model.predict(X_test_reg)
# Evaluation metrics
mse = mean_squared_error(y_test_reg, lr_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_reg, lr_pred)
print(f"\nRegression Metrics:")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"Rยฒ Score: {r2:.3f}")
# Clustering Example
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X)
print(f"\nCluster centers:\n{kmeans.cluster_centers_}")
Introduction to neural networks using TensorFlow and Keras. Build your first deep learning models.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
# Load MNIST dataset
(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()
# Preprocess data
X_train = X_train.reshape(-1, 28*28) / 255.0 # Flatten and normalize
X_test = X_test.reshape(-1, 28*28) / 255.0
# Build neural network
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(784,)),
    layers.Dropout(0.2),  # Prevent overfitting
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10, activation='softmax')  # 10 classes (digits 0-9)
])
# Compile model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
# Display model architecture
model.summary()  # summary() prints the architecture table itself
# Train model
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)
# Evaluate model
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_accuracy:.4f}")
# Make predictions
predictions = model.predict(X_test[:5])
predicted_classes = np.argmax(predictions, axis=1)
print(f"Predicted classes: {predicted_classes}")
print(f"Actual classes: {y_test[:5]}")
# CNN for Image Classification
def create_cnn_model():
    model = keras.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(10, activation='softmax')
    ])
    return model
# Reshape for CNN (add channel dimension)
X_train_cnn = X_train.reshape(-1, 28, 28, 1)
X_test_cnn = X_test.reshape(-1, 28, 28, 1)
cnn_model = create_cnn_model()
cnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
# Train CNN
cnn_history = cnn_model.fit(
    X_train_cnn, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    verbose=1
)
# Save model
# cnn_model.save('mnist_cnn_model.h5')
# Load model
# loaded_model = keras.models.load_model('mnist_cnn_model.h5')
Apply your data science skills to solve real business problems with these comprehensive projects.
Project 1: Customer Churn Prediction
Objective: Predict which customers are likely to cancel their subscription
Techniques: Logistic regression, random forest, XGBoost, feature engineering, ROC-AUC analysis
Dataset: Customer demographics, usage patterns, billing history, support tickets
Project 2: Sales Forecasting
Objective: Predict future sales for inventory management and planning
Techniques: Time series analysis, ARIMA, Prophet, LSTM networks, seasonality detection
Dataset: Historical sales data, promotions, holidays, external factors (weather, events)
Project 3: Recommendation System
Objective: Build a movie/product recommendation engine
Techniques: Collaborative filtering, content-based filtering, matrix factorization, neural networks
Dataset: User ratings, product features, user behavior, click streams
Project 4: Image Classification
Objective: Build a CNN to classify images (e.g., medical images, products, animals)
Techniques: Transfer learning (VGG16, ResNet), data augmentation, fine-tuning, deployment
Dataset: Labeled image dataset (10,000+ images), validation/test splits
Project 5: Sentiment Analysis
Objective: Analyze customer sentiment from reviews and social media
Techniques: NLP, VADER, transformers (BERT), topic modeling, visualization
Dataset: Product reviews, tweets, customer feedback, support tickets