This notebook contains solutions to the exercises from the “Working with Data” lecture.
library(ggplot2)

ggplot(diamonds, aes(x = carat, y = price, color = cut)) +
  geom_point(alpha = 0.5) +
  labs(title = "Diamond Price vs. Carat",
       subtitle = "Colored by cut quality",
       x = "Carat",
       y = "Price (USD)",
       color = "Cut Quality") +
  theme_minimal()
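With roughly 54,000 diamonds, alpha blending alone still leaves heavy overplotting in the dense lower-left corner. One possible variant is to bin the points instead; this sketch uses geom_bin2d(), which ships with ggplot2:

# Binned counts replace individual points, avoiding overplotting
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_bin2d(bins = 50) +
  labs(title = "Diamond Price vs. Carat (binned counts)",
       x = "Carat",
       y = "Price (USD)",
       fill = "Count") +
  theme_minimal()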
ggplot(mpg, aes(x = class, y = hwy, fill = class)) +
  geom_boxplot() +
  labs(title = "Highway MPG Distribution by Vehicle Class",
       x = "Vehicle Class",
       y = "Highway MPG",
       fill = "Vehicle Class") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(dplyr)
library(tidyr)

# Halve uempmed so it shares the psavert scale; the secondary axis undoes
# the transformation so its labels read in true weeks
economics_long <- economics %>%
  dplyr::select(date, psavert, uempmed) %>%
  pivot_longer(cols = c(psavert, uempmed), names_to = "variable", values_to = "value") %>%
  mutate(value = if_else(variable == "uempmed", value / 2, value))

ggplot(economics_long, aes(x = date, y = value, color = variable)) +
  geom_line() +
  scale_y_continuous(
    name = "Personal Savings Rate (%)",
    sec.axis = sec_axis(~ . * 2, name = "Median Unemployment Duration (Weeks)")
  ) +
  scale_color_manual(values = c("psavert" = "blue", "uempmed" = "red"),
                     labels = c("psavert" = "Personal Savings Rate",
                                "uempmed" = "Median Unemployment Duration")) +
  labs(title = "Personal Savings Rate and Unemployment Duration Over Time",
       x = "Date",
       color = "Measure") +
  theme_minimal()
ggplot(diamonds, aes(x = price, fill = color)) +
  geom_histogram(binwidth = 1000, position = "dodge") +
  # alternative: stacked bars with geom_histogram(binwidth = 1000)
  facet_wrap(~ cut, scales = "free_y") +
  labs(title = "Distribution of Diamond Prices by Cut and Color",
       x = "Price (USD)",
       y = "Count",
       fill = "Diamond Color") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(mpg, aes(x = cty, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ cyl) +
  labs(title = "City MPG vs. Highway MPG by Number of Cylinders",
       subtitle = "Colored by Drive Type",
       x = "City MPG",
       y = "Highway MPG",
       color = "Drive Type") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
top_9_cities <- txhousing %>%
  group_by(city) %>%
  summarize(median_price = median(median, na.rm = TRUE)) %>%
  slice_max(median_price, n = 9, with_ties = FALSE) %>%  # guarantees exactly 9 cities
  pull(city)

txhousing_top9 <- txhousing %>%
  filter(city %in% top_9_cities)

ggplot(txhousing_top9, aes(x = date, y = median)) +
  geom_line() +
  facet_wrap(~ city, nrow = 3, scales = "free_y") +
  labs(title = "Median Housing Price Over Time for Top 9 Texas Cities",
       x = "Date",
       y = "Median Price (USD)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
library(zoo)

# Create a data frame with date and psavert
economics_ts <- data.frame(
  date = economics$date,
  psavert = economics$psavert
)

# Calculate the 12-month moving average
economics_ts$ma_12 <- rollmean(economics_ts$psavert, k = 12, fill = NA, align = "right")

# Create the plot (linewidth replaces the deprecated size aesthetic for lines)
ggplot(economics_ts, aes(x = date)) +
  geom_line(aes(y = psavert), color = "blue") +
  geom_line(aes(y = ma_12), color = "red", linewidth = 1) +
  labs(title = "Personal Savings Rate (1967-2015)",
       subtitle = "With 12-month Moving Average",
       x = "Date",
       y = "Personal Savings Rate (%)") +
  theme_minimal()
Warning: Removed 11 rows containing missing values or values outside the scale range (`geom_line()`).
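Hardcoding the colors outside aes() means the plot carries no legend distinguishing the raw series from its moving average. A sketch of one variant, assuming economics_ts is unchanged, that maps the series names to color inside aes() so ggplot builds the legend automatically:

# Mapping color inside aes() makes ggplot generate a legend for the two lines
ggplot(economics_ts, aes(x = date)) +
  geom_line(aes(y = psavert, color = "Monthly rate")) +
  geom_line(aes(y = ma_12, color = "12-month moving average"), linewidth = 1) +
  scale_color_manual(name = NULL,
                     values = c("Monthly rate" = "blue",
                                "12-month moving average" = "red")) +
  labs(title = "Personal Savings Rate (1967-2015)",
       x = "Date",
       y = "Personal Savings Rate (%)") +
  theme_minimal()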
# Load required libraries
library(ggplot2)
library(zoo)

# nhtemp is already an annual ts (1912-1971); the explicit ts() call keeps that visible
nhtemp_ts <- ts(nhtemp, start = 1912, end = 1971, frequency = 1)

# Calculate a 5-year centered moving average for the trend
nhtemp_ma <- rollmean(nhtemp_ts, k = 5, fill = NA)

# Calculate the difference from the moving average
nhtemp_diff <- nhtemp_ts - nhtemp_ma

# Create a data frame for plotting
nhtemp_df <- data.frame(
  year = as.numeric(time(nhtemp_ts)),
  temperature = as.vector(nhtemp_ts),
  trend = as.vector(nhtemp_ma),
  fluctuation = as.vector(nhtemp_diff)
)

# Plot the original time series and its trend
ggplot(nhtemp_df, aes(x = year)) +
  geom_line(aes(y = temperature), color = "blue") +
  geom_line(aes(y = trend), color = "red", linewidth = 1) +
  labs(title = "New Haven Temperatures (1912-1971)",
       x = "Year",
       y = "Temperature (°F)") +
  theme_minimal()

# Plot the fluctuations around the trend
ggplot(nhtemp_df, aes(x = year, y = fluctuation)) +
  geom_line(color = "green") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  labs(title = "Temperature Fluctuations from 5-Year Moving Average",
       x = "Year",
       y = "Temperature Difference (°F)") +
  theme_minimal()
library(forecast)

# co2 is already a monthly ts starting January 1959
co2_ts <- ts(co2, start = c(1959, 1), frequency = 12)

# Fit an ARIMA model automatically and forecast 24 months ahead
co2_model <- auto.arima(co2_ts)
co2_forecast <- forecast(co2_model, h = 24)

autoplot(co2_forecast) +
  labs(title = "CO2 Concentration Forecast",
       x = "Year",
       y = "CO2 Concentration (ppm)") +
  theme_minimal()
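Before reading much into the 24-month forecast, it is worth confirming the residuals behave like white noise. A minimal check using checkresiduals() from the forecast package, applied to the co2_model fitted above:

# Plots the residuals, their ACF, and a histogram, and runs a Ljung-Box test;
# a large p-value suggests no remaining autocorrelation
checkresiduals(co2_model)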
# Note: MASS also exports select(), masking dplyr::select() once loaded;
# the qualified dplyr::select() calls above avoid the clash
library(MASS)

model <- lm(medv ~ rm, data = Boston)
summary(model)

ggplot(Boston, aes(x = rm, y = medv)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Median House Value vs. Average Number of Rooms",
       x = "Average Number of Rooms",
       y = "Median House Value ($1000s)") +
  theme_minimal()
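If only the headline numbers are needed rather than the full summary() printout, they can be pulled out directly; a small sketch using the model object above:

# Extract the coefficients and R-squared without printing the full summary
coef(model)                   # intercept and slope for rm
summary(model)$r.squared      # proportion of variance in medv explained by rm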
model <- lm(qsec ~ hp + wt + factor(am), data = mtcars)
summary(model)

# Standard diagnostic plots in a 2x2 grid, then restore the default layout
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))
# cut, color, and clarity are ordered factors, so lm() encodes them with
# polynomial contrasts (the .L, .Q, .C terms in the summary)
model <- lm(price ~ carat + cut + color + clarity, data = diamonds)
summary(model)
library(caret)   # provides createDataPartition()
library(pROC)

# Prepare the data
mtcars$am <- as.factor(mtcars$am)
set.seed(123)
train_indices <- createDataPartition(mtcars$am, p = 0.7, list = FALSE)
train_data <- mtcars[train_indices, ]
test_data <- mtcars[-train_indices, ]

# Fit the logistic regression model
model <- glm(am ~ mpg + hp + wt, data = train_data, family = "binomial")

# Make predictions on the held-out set
predictions <- predict(model, newdata = test_data, type = "response")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)

# Confusion matrix
conf_matrix <- table(Predicted = predicted_classes, Actual = test_data$am)
print(conf_matrix)
         Actual
Predicted 0 1
        0 5 0
        1 0 3
# ROC curve
roc_curve <- roc(test_data$am, predictions)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve for Transmission Type Prediction")
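The curve can be reduced to a single number with auc() from pROC; note that with only a handful of held-out rows from mtcars, this is a very coarse estimate:

# Area under the ROC curve: 1 = perfect separation, 0.5 = chance
auc(roc_curve)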
library(rattle)
data(wine)

# Prepare the data
set.seed(123)
train_indices <- createDataPartition(wine$Type, p = 0.7, list = FALSE)
train_data <- wine[train_indices, ]
test_data <- wine[-train_indices, ]

# Find the best k using cross-validation
k_values <- 1:20
accuracy <- sapply(k_values, function(k) {
  model <- train(Type ~ ., data = train_data, method = "knn",
                 trControl = trainControl(method = "cv", number = 5),
                 tuneGrid = data.frame(k = k))
  return(model$results$Accuracy)
})
best_k <- k_values[which.max(accuracy)]

# Train the final model with the best k
final_model <- train(Type ~ ., data = train_data, method = "knn",
                     trControl = trainControl(method = "cv", number = 5),
                     tuneGrid = data.frame(k = best_k))

# Evaluate on the test set
predictions <- predict(final_model, newdata = test_data)
confusionMatrix(predictions, test_data$Type)
Confusion Matrix and Statistics

          Reference
Prediction  1  2  3
         1 14  0  0
         2  0 12  5
         3  3  9  9

Overall Statistics

               Accuracy : 0.6731
                 95% CI : (0.5289, 0.7967)
    No Information Rate : 0.4038
    P-Value [Acc > NIR] : 7.949e-05

                  Kappa : 0.5129

 Mcnemar's Test P-Value : NA

Statistics by Class:

                     Class: 1 Class: 2 Class: 3
Sensitivity            0.8235   0.5714   0.6429
Specificity            1.0000   0.8387   0.6842
Pos Pred Value         1.0000   0.7059   0.4286
Neg Pred Value         0.9211   0.7429   0.8387
Prevalence             0.3269   0.4038   0.2692
Detection Rate         0.2692   0.2308   0.1731
Detection Prevalence   0.2692   0.3269   0.4038
Balanced Accuracy      0.9118   0.7051   0.6635
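The manual sapply() loop plus refit duplicates work caret already does: passing the whole grid to a single train() call cross-validates every k and keeps the best one. A sketch, assuming train_data from above:

# One call evaluates all 20 candidate k values and retains the winner
knn_model <- train(Type ~ ., data = train_data, method = "knn",
                   trControl = trainControl(method = "cv", number = 5),
                   tuneGrid = data.frame(k = 1:20))
knn_model$bestTune    # the selected k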
library(ISLR)
library(rpart)
library(rpart.plot)
library(randomForest)

# Prepare the data
set.seed(123)
train_indices <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data <- Default[train_indices, ]
test_data <- Default[-train_indices, ]

# Decision Tree
tree_model <- rpart(default ~ ., data = train_data, method = "class")
rpart.plot(tree_model, extra = 101)
tree_predictions <- predict(tree_model, newdata = test_data, type = "class")
tree_cm <- confusionMatrix(tree_predictions, test_data$default)
print(tree_cm)
Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  2885   64
       Yes   15   35

               Accuracy : 0.9737
                 95% CI : (0.9673, 0.9791)
    No Information Rate : 0.967
    P-Value [Acc > NIR] : 0.0204

                  Kappa : 0.4578

 Mcnemar's Test P-Value : 6.648e-08

            Sensitivity : 0.9948
            Specificity : 0.3535
         Pos Pred Value : 0.9783
         Neg Pred Value : 0.7000
             Prevalence : 0.9670
         Detection Rate : 0.9620
   Detection Prevalence : 0.9833
      Balanced Accuracy : 0.6742

       'Positive' Class : No
# Random Forest
rf_model <- randomForest(default ~ ., data = train_data)
rf_predictions <- predict(rf_model, newdata = test_data)
rf_cm <- confusionMatrix(rf_predictions, test_data$default)
print(rf_cm)
Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  2884   70
       Yes   16   29

               Accuracy : 0.9713
                 95% CI : (0.9647, 0.977)
    No Information Rate : 0.967
    P-Value [Acc > NIR] : 0.09875

                  Kappa : 0.3902

 Mcnemar's Test P-Value : 1.096e-08

            Sensitivity : 0.9945
            Specificity : 0.2929
         Pos Pred Value : 0.9763
         Neg Pred Value : 0.6444
             Prevalence : 0.9670
         Detection Rate : 0.9617
   Detection Prevalence : 0.9850
      Balanced Accuracy : 0.6437

       'Positive' Class : No
# Compare performance
cat("Decision Tree Accuracy:", tree_cm$overall["Accuracy"], "\n")
Decision Tree Accuracy: 0.9736579
cat("Random Forest Accuracy:", rf_cm$overall["Accuracy"], "\n")
Random Forest Accuracy: 0.9713238
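Accuracy alone hides which predictors the forest relies on. A short follow-up using randomForest's built-in importance measures, applied to the rf_model fitted above:

# Mean decrease in Gini impurity; larger values = more influential predictors
importance(rf_model)
varImpPlot(rf_model, main = "Variable Importance for Default Prediction")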