# Create sample data
heights <- c(165, 170, 172, 168, 175, 169, 180, 167, 173, 171)
# Basic histogram
hist(heights)
hist() creates a frequency distribution visualization.
# Add colors and labels
hist(heights,
     col = "lightblue",
     main = "Height Distribution",
     xlab = "Height (cm)",
     breaks = 5)
col controls the bar colors, main adds the title, and breaks suggests the number of bins (R treats the value as a hint, not an exact count).
# Generate normal data
normal_data <- rnorm(50)
# Generate non-normal data
non_normal <- rexp(30)
# Perform tests
shapiro.test(normal_data)
shapiro.test(non_normal)
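shapiro.test() returns a list, and the p-value is the piece we usually act on. A minimal sketch of the conventional 0.05 rule (your numbers will differ because the data are random):
shapiro.test(normal_data)$p.value   # usually > 0.05: no evidence against normality
shapiro.test(non_normal)$p.value    # often < 0.05: normality rejected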
# View mtcars structure
head(mtcars)
# Split data by cylinders
mpg_4cyl <- mtcars[mtcars$cyl == 4, "mpg"]
mpg_6cyl <- mtcars[mtcars$cyl == 6, "mpg"]
# Independent t-test
t_result <- t.test(mpg_4cyl, mpg_6cyl)
print(t_result)
# Check p-value
t_result$p.value
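To turn the stored p-value into a decision programmatically, a quick sketch using the conventional 0.05 cutoff:
if (t_result$p.value < 0.05) {
  cat("Significant difference in mean mpg between 4- and 6-cylinder cars\n")
} else {
  cat("No significant difference detected\n")
}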
# Simulate data
before <- round(rnorm(20, 130, 10))
after <- before - rnorm(20, 8, 3)
# Create data frame
bp_data <- data.frame(before, after)
# Paired t-test
t.test(bp_data$before, bp_data$after, paired = TRUE)
# Calculate differences
bp_diff <- bp_data$after - bp_data$before   # renamed to avoid masking base R's diff()
hist(bp_diff)
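As a cross-check, the paired test is equivalent to a one-sample t-test on these differences; the p-value matches (the sign of t flips because we computed after minus before):
t.test(bp_diff, mu = 0)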
paired = TRUE is the crucial parameter: without it, t.test() runs an independent two-sample test.
# Extract Petal.Width for two iris species
setosa <- iris[iris$Species == "setosa", "Petal.Width"]
versicolor <- iris[iris$Species == "versicolor", "Petal.Width"]
# Wilcoxon Rank Sum test
wilcox.test(setosa, versicolor)
# Store and examine results
wilcox_result <- wilcox.test(setosa, versicolor)
names(wilcox_result)
# Check test statistic and p-value
wilcox_result$statistic
wilcox_result$p.value
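Petal widths contain many tied values, so R may warn that an exact p-value cannot be computed and fall back to a normal approximation. You can request that behaviour explicitly (a sketch):
wilcox.test(setosa, versicolor, exact = FALSE)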
# Advertising budget vs Sales (simulated)
set.seed(123)
ad_spend <- seq(100, 1000, by=100)
sales <- 50 + 0.7*ad_spend + rnorm(10, 0, 30)
Fit the model with the lm() function:
model <- lm(sales ~ ad_spend)
sales: the dependent variable (what we predict)
ad_spend: the independent variable (the predictor)
summary(model)
plot(ad_spend, sales, pch = 19, col = "blue",
     main = "Ad Spending vs Sales")
abline(model, col = "red", lwd = 2)
grid()
text(800, 400, paste("y =", round(coef(model)[2], 2), "x +",
                     round(coef(model)[1], 2)))
model_residuals <- model$residuals   # renamed to avoid masking base R's residuals()
par(mfrow=c(2,2))
plot(model)
par(mfrow=c(1,1))
# If the coefficient is 0.72:
cat("For every $100 increase in ad spending,",
    "sales increase by about", 0.72 * 100, "units")
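With the model in hand, predict() gives fitted sales for new budgets. A sketch (the budget values are made-up examples within the 100-1000 range we fitted; going far outside it would be extrapolation):
new_budget <- data.frame(ad_spend = c(250, 750))
predict(model, newdata = new_budget)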
“PCA helps simplify complex data while preserving patterns”
# Original iris data (first 2 rows)
head(iris[,1:4], 2)
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# 1 5.1 3.5 1.4 0.2
# 2 4.9 3.0 1.4 0.2
# Standardized data
scaled_iris <- scale(iris[,1:4])
head(scaled_iris, 2)
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# [1,] -0.8976739 1.01560199 -1.335752 -1.311052
# [2,] -1.1392005 -0.13153881 -1.335752 -1.311052
pca <- prcomp(
  x = scaled_iris,   # standardized data
  center = FALSE,    # already centered by scale()
  scale. = FALSE     # already scaled
)
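Equivalently, prcomp() can center and scale for you, so the separate scale() step is optional:
pca_direct <- prcomp(iris[, 1:4], center = TRUE, scale. = TRUE)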
names(pca)
# [1] "sdev" "rotation" "center" "scale" "x"
pc_scores <- pca$x[,1:2]
head(pc_scores)
# PC1 PC2
# [1,] -2.257141 -0.4784238
# [2,] -2.074013 0.6718827
# [3,] -2.356335 0.3407664
summary(pca)
# Importance of components:
# PC1 PC2 PC3 PC4
# Standard deviation 1.7084 0.9560 0.38309 0.14393
# Proportion of Variance 0.7296 0.2285 0.03669 0.00518
plot(pc_scores,
col = c("red","green","blue")[iris$Species],
pch = 19, main = "PCA: Iris Species")
legend("topright", legend=levels(iris$Species),
fill=c("red","green","blue"), cex=0.7)
bp <- barplot(pca$sdev^2,
              names.arg = paste0("PC", 1:4),
              main = "Variance per Component",
              ylab = "Variance",
              col = "skyblue")
variance_percent <- round(pca$sdev^2 / sum(pca$sdev^2) * 100, 1)
text(bp, pca$sdev^2, labels = paste0(variance_percent, "%"), pos = 3)   # bp holds the bar midpoints
pca$rotation[,1]
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# 0.5210659 -0.2693474 0.5804131 0.5648565
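To see how the original variables load onto the components, base R's biplot() overlays scores and loading arrows directly from the prcomp object:
biplot(pca, cex = 0.6)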
# 1. Standardize the data (raw_data is a placeholder for your own matrix)
scaled_data <- scale(raw_data)
# 2. Run PCA
my_pca <- prcomp(scaled_data)
# 3. Extract the first two components
components <- my_pca$x[, 1:2]
# 4. Visualize (groups is a placeholder vector of class labels)
plot(components, col = groups)
barplot(my_pca$sdev^2)
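A quick check of how much variance the kept components explain (a sketch; summary() on a prcomp object stores this in its importance matrix):
summary(my_pca)$importance[2, ]              # proportion of variance per PC
cumsum(my_pca$sdev^2) / sum(my_pca$sdev^2)   # cumulative proportion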
head(iris, 2)
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1 5.1 3.5 1.4 0.2 setosa
# 2 4.9 3.0 1.4 0.2 setosa
We'll use the two petal measurements, Petal.Length and Petal.Width. Their different scales can distort distance calculations:
summary(iris[,3:4])
# Petal.Length Petal.Width
# Min. :1.000 Min. :0.100
# Max. :6.900 Max. :2.500
Let’s standardize:
scaled_data <- scale(iris[,3:4])
head(scaled_data)
Basic syntax:
kmeans(x, centers, nstart)
Here x is the data, centers is the number of clusters, and nstart is how many random initializations to try (the best run is kept).
Let’s make 3 clusters:
set.seed(123)
km_result <- kmeans(scaled_data, centers=3, nstart=10)
Key components:
names(km_result)
# [1] "cluster" "centers" "totss" "withinss"
# [5] "tot.withinss" "betweenss" "size"
km_result$size
# [1] 50 53 47
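The centers in km_result$centers are in standardized units. To read them on the original measurement scale, undo the scaling using the attributes that scale() stores (a sketch):
t(t(km_result$centers) * attr(scaled_data, "scaled:scale") +
    attr(scaled_data, "scaled:center"))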
How to find the optimal number of clusters with the elbow method:
sse <- numeric(10)
for (k in 1:10) {
  km <- kmeans(scaled_data, centers = k, nstart = 10)   # nstart keeps each fit stable
  sse[k] <- km$tot.withinss
}
plot(1:10, sse, type = "b", pch = 19, col = "steelblue",
     main = "Elbow Method Visualization",
     xlab = "Number of Clusters (k)",
     ylab = "Total Within-Cluster SSE")
grid()
Compare our clusters with actual species:
table(Cluster=km_result$cluster, Species=iris$Species)
# Species
# Cluster setosa versicolor virginica
# 1 50 0 0
# 2 0 23 30
# 3 0 27 20
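One quick numeric summary of the agreement is cluster purity: the fraction of points that share their cluster's majority species (a sketch):
tab <- table(km_result$cluster, iris$Species)
sum(apply(tab, 1, max)) / sum(tab)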
Color = Cluster, Shape = Species:
plot_data <- data.frame(scaled_data,
Cluster=factor(km_result$cluster),
Species=iris$Species)
colors <- c("red", "blue", "green", "orange", "purple")
shapes <- c(1, 2, 3)
cluster_colors <- colors[plot_data$Cluster]     # factors index by their integer codes
species_shapes <- shapes[plot_data$Species]
plot(plot_data$Petal.Length, plot_data$Petal.Width,
col = cluster_colors,
pch = species_shapes,
xlab = "Petal Length",
ylab = "Petal Width",
main = "Petal.Length vs Petal.Width by Cluster & Species",
cex = 1.5)
Add cluster centers:
centers <- km_result$centers
# With the scatter plot from above still displayed, overlay the centers:
points(centers[,1], centers[,2],
col = "black",
pch = 8,
cex = 2.5)
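Because color and shape encode different things, two separate legends keep the plot readable (a sketch):
legend("topleft", legend = paste("Cluster", 1:3),
       fill = c("red", "blue", "green"), cex = 0.8)
legend("bottomright", legend = levels(iris$Species),
       pch = shapes, cex = 0.8)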
Why simulate data? With simulated data we know the ground truth (here: pure noise, no real effects), so we can check that the analysis behaves sensibly.
# Simulate 100 genes measured in 6 samples (3 control, 3 treatment)
gene_data <- matrix(rnorm(600), ncol = 6)
What does our matrix look like?
# First 5 rows
gene_data[1:5, ]
# (a 5 x 6 block of random values; your numbers will differ from run to run)
Add meaningful labels:
colnames(gene_data) <- c("c1", "c2", "c3", "t1", "t2", "t3")
rownames(gene_data) <- paste("Gene", 1:100, sep="_")
Run t-tests for all genes:
p_values <- apply(gene_data, 1, function(x) {
  t.test(x[1:3], x[4:6])$p.value   # control (cols 1-3) vs treatment (cols 4-6)
})
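Because the data are pure noise, the raw p-values should be roughly uniform between 0 and 1; a quick sanity check:
hist(p_values, breaks = 20, main = "Raw p-value distribution")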
Multiple testing correction:
adj_p <- p.adjust(p_values, method="BH")
head(adj_p)
# Gene_1 Gene_2 Gene_3 Gene_4 Gene_5
# 0.845 0.123 0.056 0.734 0.912
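After BH correction, few if any pure-noise genes should pass a 5% FDR threshold:
sum(adj_p < 0.05)   # typically 0 for pure-noise data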
Compute the mean difference per gene, treatment minus control (for log-scale expression data this would be the log2 fold change):
logFC <- rowMeans(gene_data[, 4:6]) - rowMeans(gene_data[, 1:3])
results <- data.frame(logFC, p_values, adj_p)
Prepare for volcano plot:
results$color <- rep("grey", nrow(results))
results$color[results$p_values < 0.1] <- "blue"
results$color[results$adj_p < 0.1] <- "red"
Basic volcano plot:
plot(results$logFC, -log10(results$adj_p),
col = results$color, pch = 16,
xlab = "log2 Fold Change",
ylab = "-log10(adjusted p-value)")
Add finishing touches:
abline(h = -log10(0.05), col = "blue", lty=2)
abline(v = c(-1,1), col = "green", lty=2)
title("Differential Expression Results")