# Page 1: Data Processing and Visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def load_and_clean_data(filepath):
    """Load a CSV file into a DataFrame and apply basic cleaning.

    Rows containing any missing value are removed, then duplicate rows are
    removed. If the data has a ``date`` column, it is converted with
    ``pd.to_datetime``; when that conversion fails, a warning is printed and
    the column is left as it was loaded.

    Args:
        filepath: Location of the CSV file; passed unchanged to
            ``pd.read_csv``.

    Returns:
        The cleaned DataFrame, or ``None`` if the file was not found (an
        error message is printed in that case).
    """
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None

    # Basic data cleaning (example - adapt as needed): drop rows with
    # missing values first, then drop duplicate rows.
    df = df.dropna().drop_duplicates()

    # Convert the 'date' column to datetime on a best-effort basis.
    if 'date' in df.columns:
        try:
            df['date'] = pd.to_datetime(df['date'])
        except (ValueError, TypeError):
            # to_datetime signals unparseable strings with ValueError but
            # rejects some value types (e.g. booleans) with TypeError; both
            # mean "leave the column alone", not "abort the whole load".
            print("Warning: Could not convert 'date' column to datetime.")
    return df
def visualize_data(df, column1, column2, plot_type='scatter'):
    """Draw and display a single plot of the given DataFrame columns.

    Args:
        df: Data to plot. If ``None``, the function returns immediately
            without plotting.
        column1: Column used for the x-axis (and as the plotted variable for
            a histogram).
        column2: Column used for the y-axis of scatter and bar plots. It is
            not used when ``plot_type`` is ``'hist'``.
        plot_type: One of ``'scatter'``, ``'bar'`` or ``'hist'``. Any other
            value prints an error message and nothing is plotted.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if df is None:
        return

    # Validate before creating the figure so that an unsupported plot type
    # does not leave an empty, never-shown figure open.
    if plot_type not in ('scatter', 'bar', 'hist'):
        print("Invalid plot type. Choose from 'scatter', 'bar', or 'hist'.")
        return

    plt.figure(figsize=(8, 6))  # Adjust figure size as needed
    if plot_type == 'scatter':
        sns.scatterplot(x=column1, y=column2, data=df)
        plt.title(f"Scatter Plot of {column1} vs {column2}")
        plt.xlabel(column1)
        plt.ylabel(column2)
    elif plot_type == 'bar':
        sns.barplot(x=column1, y=column2, data=df)
        plt.title(f"Bar Plot of {column1} vs {column2}")
        plt.xlabel(column1)
        plt.ylabel(column2)
        plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels if needed
    else:  # 'hist'
        sns.histplot(df[column1])
        plt.title(f"Histogram of {column1}")
        plt.xlabel(column1)
        plt.ylabel("Frequency")

    plt.tight_layout()  # Adjust layout to prevent labels from overlapping
    plt.show()
# Example usage:
filepath = "data.csv"  # Replace with your file path
df = load_and_clean_data(filepath)
if df is not None:
    print(df.head())  # Print first few rows
    # Replace with your column names
    visualize_data(df, 'column1', 'column2', 'scatter')
    # Example of a bar chart
    visualize_data(df, 'category_column', 'value_column', 'bar')
    # Example of a histogram (column2 is not used for this plot type)
    visualize_data(df, 'numerical_column', None, 'hist')
# More analysis/manipulation below
# ...

# Page 2: Statistical Analysis and Machine Learning (Simplified)
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression # Example model
def perform_statistical_test(df, column1, column2, test_type='ttest'):
    """Run a two-column statistical test and print its results.

    Args:
        df: DataFrame holding the data. If ``None``, nothing is done.
        column1: Name of the first column passed to the test.
        column2: Name of the second column passed to the test.
        test_type: ``'ttest'`` runs ``stats.ttest_ind``; ``'correlation'``
            runs ``stats.pearsonr``. Any other value prints an error
            message.

    Returns:
        None. The statistic and p-value are printed, not returned.
    """
    if df is None:
        return None

    # Reject unknown test names up front, before touching the data.
    if test_type not in ('ttest', 'correlation'):
        print("Invalid test type. Choose from 'ttest' or 'correlation'.")
        return None

    first_sample = df[column1]
    second_sample = df[column2]
    if test_type == 'ttest':
        statistic, p_value = stats.ttest_ind(first_sample, second_sample)
        print(f"T-statistic: {statistic}")
    else:
        statistic, p_value = stats.pearsonr(first_sample, second_sample)
        print(f"Correlation coefficient: {statistic}")

    # Both tests report a p-value in the same format.
    print(f"P-value: {p_value}")
    return None
def train_and_evaluate_model(df, features, target):
    """Fit a linear regression on a train/test split and report its score.

    Args:
        df: DataFrame holding the data. If ``None``, nothing is done.
        features: Column selection used as the independent variables
            (``df[features]``).
        target: Column used as the dependent variable (``df[target]``).

    Returns:
        The fitted ``LinearRegression`` model, or ``None`` when ``df`` is
        ``None``. The score on the held-out split is printed.
    """
    if df is None:
        return None

    inputs = df[features]  # independent variables
    labels = df[target]  # dependent variable

    # Hold out 20% of the rows for evaluation; the fixed seed keeps the
    # split reproducible between runs.
    partitions = train_test_split(
        inputs,
        labels,
        test_size=0.2,
        random_state=42,
    )
    train_inputs, test_inputs, train_labels, test_labels = partitions

    model = LinearRegression()  # Example model - can be replaced
    model.fit(train_inputs, train_labels)

    # Evaluation (example - adapt as needed): R-squared for Linear Regression
    held_out_score = model.score(test_inputs, test_labels)
    print(f"Model score: {held_out_score}")
    return model
# Example usage (continued from Page 1):
if df is not None:
    # Example t-test
    perform_statistical_test(df, 'column1', 'column2', 'ttest')
    # Example correlation
    perform_statistical_test(df, 'column1', 'column2', 'correlation')
    features = ['feature1', 'feature2']  # Replace with your feature names
    target = 'target_variable'  # Replace with your target variable name
    trained_model = train_and_evaluate_model(df, features, target)
    # You can now use the trained model to make predictions
    # ...