Data Analysis and Machine Learning Techniques with Python
1. California Housing Dataset: EDA and Outlier Analysis
Importing Libraries and Loading Data
The following libraries are imported for data manipulation, visualization, and dataset loading:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_california_housing
california = fetch_california_housing(as_frame=True)
df = california.frame
Identifying Numerical Features
numerical_features = df.select_dtypes(include=['number']).columns
Visualizing Data Distribution: Histograms
Histograms with kernel density estimate (KDE) overlays are used to visualize the distribution of each numerical feature. The frame contains nine numeric columns (eight features plus the MedHouseVal target), so a 3×3 subplot grid holds them all.
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.histplot(df[feature], kde=True)
    plt.title(f'Histogram of {feature}')
plt.tight_layout()
plt.show()
Visualizing Data Spread: Boxplots
Boxplots help identify the spread and potential outliers in each numerical feature.
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(y=df[feature])
    plt.title(f'Boxplot of {feature}')
plt.tight_layout()
plt.show()
Outlier Detection using the IQR Method
A function is defined to detect outliers with the Interquartile Range (IQR) method: any value below $Q_1 - 1.5 \times IQR$ or above $Q_3 + 1.5 \times IQR$, where $IQR = Q_3 - Q_1$, is flagged as an outlier.
def detect_outliers_iqr(data):
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    return outliers
Analyzing Outliers for Each Feature
print("Outlier Analysis:")
for feature in numerical_features:
outliers = detect_outliers_iqr(df )
print(f"\nFeature: {feature}")
print(f"Number of outliers: {len(outliers)}")
if 0 < len(outliers) < 20:
print(f"Outlier values: {outliers.values}")
elif len(outliers) >= 20:
print("Too many outliers to display.")
else:
print("No outliers detected.")
2. Correlation Analysis of California Housing Data
Loading Data and Calculating Correlation Matrix
The California Housing dataset is reloaded and the target variable (MedHouseVal, the median house value) is appended as a column so that it is included in the correlation analysis.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['MedHouseVal'] = data.target
correlation_matrix = df.corr()
print("Correlation Matrix:\n", correlation_matrix)
Visualizing Correlation with a Heatmap
A heatmap provides a clear visual representation of the linear relationships between all variables.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, linewidths=.5)
plt.title("Correlation Heatmap - California Housing Dataset")
plt.tight_layout()
plt.show()
Pair Plot for Feature Relationships
A pair plot is generated for a subset of key features to visualize their pairwise relationships and individual distributions.
selected_features = ['MedInc', 'HouseAge', 'AveRooms', 'AveOccup', 'MedHouseVal']
sns.pairplot(df[selected_features], corner=True, diag_kind='kde')
plt.suptitle("Pair Plot of Selected Features", y=1.02)
plt.tight_layout()
plt.show()
3. Dimensionality Reduction using PCA on the Iris Dataset
Importing Libraries and Loading Iris Data
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
iris = load_iris()
data = iris.data
labels = iris.target
label_names = iris.target_names
iris_df = pd.DataFrame(data, columns=iris.feature_names)
Applying PCA (2 Components)
Principal Component Analysis (PCA) is applied to reduce the Iris data from four features to two principal components.
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(data)
reduced_df = pd.DataFrame(data_reduced, columns=['Principal Component 1', 'Principal Component 2'])
reduced_df['Label'] = labels
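Before plotting, it is worth checking how much of the original variance the two components retain; explained_variance_ratio_ is the standard scikit-learn attribute for this, and for the Iris data the first two components together typically retain about 97% of the variance.
# Fraction of the total variance captured by each principal component
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())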
Visualizing PCA Results
The reduced data is plotted in a 2D scatter plot, colored by the original species label, demonstrating separability.
plt.figure(figsize=(8, 6))
colors = ['r', 'g', 'b']
for i, label in enumerate(np.unique(labels)):
    plt.scatter(
        reduced_df[reduced_df['Label'] == label]['Principal Component 1'],
        reduced_df[reduced_df['Label'] == label]['Principal Component 2'],
        label=label_names[label],
        color=colors[i]
    )
plt.title('PCA on Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid()
plt.show()
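Inspecting Component Loadings
To see which original measurements drive each component, the loading matrix pca.components_ (a standard scikit-learn attribute) can be inspected; this optional step is an addition to the exercise above.
# Loadings: one row per principal component, one column per original iris feature
loadings = pd.DataFrame(pca.components_, columns=iris.feature_names, index=['PC1', 'PC2'])
print(loadings)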
4. Implementation of the Find-S Algorithm
Defining the Find-S Function
The Find-S algorithm is a basic concept learning algorithm that finds the most specific hypothesis consistent with the positive training examples.
import pandas as pd
def find_s_algorithm():
    data = pd.DataFrame({
        'Sky': ['Sunny', 'Sunny', 'Cloudy', 'Rainy', 'Sunny'],
        'Temperature': ['Warm', 'Hot', 'Warm', 'Cold', 'Warm'],
        'Humidity': ['Normal', 'High', 'High', 'Normal', 'Normal'],
        'Wind': ['Strong', 'Weak', 'Strong', 'Strong', 'Weak'],
        'PlayTennis': ['Yes', 'No', 'Yes', 'No', 'Yes']
    })
    print("Training data:")
    print(data)
    attributes = data.columns[:-1]
    class_label = data.columns[-1]
    # Start from the most specific hypothesis: the first positive example itself
    hypothesis = None
    for _, row in data.iterrows():
        if row[class_label] == 'Yes':  # Find-S ignores negative examples
            if hypothesis is None:
                hypothesis = list(row[attributes])
            else:
                for i, value in enumerate(row[attributes]):
                    if hypothesis[i] != value:
                        hypothesis[i] = '?'  # generalize attributes that disagree
    return hypothesis
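Running Find-S
A short driver, added here for illustration since the listing above only defines the function, runs Find-S and prints the learned hypothesis; for the training data above the result is ['?', 'Warm', '?', '?'].
# Run Find-S on the toy PlayTennis data and show the learned hypothesis
final_hypothesis = find_s_algorithm()
print("\nMost specific hypothesis:", final_hypothesis)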