Appendix B — Code and Environments for Chapter 3
This appendix contains the code used for the analysis in Chapter 3. The code is organized into five sections: R Code, R Environment, Python Training Code, Python Evaluation Code, and Python Environment. The R Code section contains the code for the analysis, the R Environment section lists the R environment used to run the analysis, the Python Training Code section contains the code used to train the models, the Python Evaluation Code section contains the code used to evaluate the models, and the Python Environment section lists the Python environment used to run the analysis.
R Code
library(tidyverse)
library(here)
library(magick)
library(sf)
library(Momocs)
library(umap)
library(randomForest)
library(SpatialKDE)
library(terra)
library(ggrepel)
library(ggnewscale)
library(ggfx)
library(colorspace)
library(viridis)
library(rnaturalearth)
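# run_hotellings(): pairwise Hotelling's T-squared tests between every pair of
# groups in `category`, returning a tibble of sample sizes, test statistics,
# and p-values; pairs where either group has fewer than five cases are skipped.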
run_hotellings <- function(groups, category, data) {
# Load necessary library
library(Hotelling)
vars <- names(data)
vars <- vars[!vars %in% c(category)]
# Initialize the results dataframe
results <- data.frame(
group1 = character(),
group2 = character(),
n_group1 = integer(),
n_group2 = integer(),
T_squared = numeric(),
p_value = numeric(),
stringsAsFactors = FALSE
)
# Loop over each pair of groups
for (i in 1:(length(groups) - 1)) {
for (j in (i + 1):length(groups)) {
group1 <- groups[i]
group2 <- groups[j]
if (group1 == group2) next
# Subset the data for the two groups
subset_data <- data[data[[category]] %in% c(group1, group2), ]
# Split the data into the two groups for Hotelling's T² test
group1_data <- subset_data[subset_data[[category]] == group1, vars]
group2_data <- subset_data[subset_data[[category]] == group2, vars]
# Calculate the sample size for each group
n_group1 <- nrow(group1_data)
n_group2 <- nrow(group2_data)
# Debugging: print the group names and sizes to verify
cat(
"Comparing", group1, "vs", group2, "- n_group1:",
n_group1, "n_group2:", n_group2, "\n"
)
# Check if there are enough samples for Hotelling's T²
if (n_group1 < 5 || n_group2 < 5) {
warning(paste(
"Skipping comparison for", group1, "vs", group2,
"due to insufficient data"
))
next
}
# Perform Hotelling's T² test
result <- tryCatch(
{
hotelling_result <- hotelling.test(group1_data, group2_data)
hotelling_result
},
error = function(e) {
warning(paste(
"Error in Hotelling's T² test for", group1, "vs",
group2, ":", e$message
))
return(NULL)
}
)
if (!is.null(result)) {
# Append results to the dataframe
results <- rbind(
results,
data.frame(
group1 = group1,
group2 = group2,
n_group1 = n_group1,
n_group2 = n_group2,
T_squared = result$stats$statistic,
p_value = result$pval
)
)
} else {
# Debugging: indicate that the result is NULL for this comparison
cat("No result for", group1, "vs", group2, "\n")
}
}
}
# Ensure p_value is numeric and round to 3 decimal places
results <- results %>%
mutate(p_value = as.numeric(p_value) %>%
round(3)) %>%
tibble::as_tibble()
# Check if results dataframe is empty and return a message if so
if (nrow(results) == 0) {
warning("No valid comparisons were made. Check your input data.")
}
return(results)
}
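# Illustrative usage only (toy_df is a made-up example, not part of the analysis):
# toy_df <- tibble(
#   grp = rep(c("A", "B", "C"), each = 10),
#   x = rnorm(30), y = rnorm(30)
# )
# run_hotellings(groups = c("A", "B", "C"), category = "grp", data = toy_df)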
i_am("analysis/Chap3Analysis.R")
points <- rio::import("data/heurist_projectilePoints.csv") %>%
janitor::clean_names() %>%
as_tibble()
points %>% glimpse()
#### Type Collection ####
type_collection <- points %>%
filter(map_lgl(
project_name_title,
~ any(.x %in% c("Justice", "MNA Type Collection"))
)) %>%
unchop(project_name_title) %>%
mutate(original_point_type = as.character(original_point_type)) %>%
filter(!is.na(x1))
types <- table(type_collection$original_point_type) %>%
as.data.frame() %>%
filter(Freq > 5) %>%
pull(Var1) %>%
as.character()
landmarks <- points %>%
select(x1:y8) %>%
names()
plotdf <- type_collection %>%
rowid_to_column() %>%
filter(original_point_type %in% types) %>%
select(rowid, all_of(landmarks)) %>%
pivot_longer(any_of(starts_with("x")), names_to = "xl", values_to = "x") %>%
select(-any_of(landmarks)) %>%
bind_cols(type_collection %>%
rowid_to_column() %>%
filter(original_point_type %in% types) %>%
select(rowid, all_of(landmarks)) %>%
pivot_longer(any_of(starts_with("y")),
names_to = "yl", values_to = "y"
) %>%
select(-any_of(landmarks), -rowid)) %>%
left_join(type_collection %>% select(-any_of(landmarks)) %>%
rowid_to_column() %>% select(rowid, original_point_type, point_form))
plotside <- plotdf %>%
filter(point_form == "side-notched") %>%
filter(original_point_type != "Cottonwood Triangular")
mean_df <- plotside %>%
filter(original_point_type != "Cottonwood Triangular") %>%
select(-rowid) %>%
group_by(original_point_type, xl, yl) %>%
summarise_at(vars(x, y), mean, na.rm = T) %>%
mutate(original_point_type = str_remove_all(
original_point_type,
"Side-notched|Side Notched"
) %>% str_replace_all("  ", " ") %>%
as.character())
plotside %>%
mutate(original_point_type = str_remove_all(
original_point_type,
"Side-notched|Side Notched"
) %>% str_replace_all("  ", " ") %>%
as.character()) %>%
ggplot(aes(x, y, fill = original_point_type)) +
geom_point(size = 1, shape = 21, color = "white", alpha = .5) +
geom_point(
data = mean_df, aes(
x = x, y = y,
fill = original_point_type
),
shape = 21, size = 4, alpha = .8, color = "black"
) +
scale_fill_viridis_d() +
theme_bw() +
coord_fixed() +
guides(fill = "none") +
facet_wrap(~original_point_type) +
theme(
strip.background = element_rect(fill = "white"),
strip.text = element_text(size = 6)
)
nm <- "mean_landmarks_side"
# ggsave(file.path("figures",paste0(nm,".png")),
# dpi = 900, width = 170, units = "mm")
# file.path("figures",paste0(nm,".png")) %>% image_read() %>%
# image_trim() %>% image_border(geometry = "50x50", color = "white") %>%
# image_write(file.path("figures",paste0(nm,".png")))
plotmeanall <- plotside %>%
group_by(xl, yl) %>%
summarize_all(mean) %>%
janitor::remove_constant() %>%
mutate(label = str_remove_all(xl, "x"))
# magick::image_read(fn) %>% magick::image_trim() %>%
# magick::image_write(fn, density = 900)
# Landmark example #
point <- read_csv("data/meanpointexample.csv", show_col_types = F)
ggplot() +
geom_polygon(data = point, aes(x, y), color = "black", size = 1) +
geom_point(
data = plotmeanall, aes(x, y), shape = 16, color = "red",
alpha = .75, size = 8
) +
geom_text(
data = plotmeanall, aes(x, y, label = label), color = "white",
size = 4
) +
theme_void() +
coord_equal()
# fn = "figures/landmarks_example.png"
# ggsave(fn,dpi = 900, width = 4, height = 5)
# triangular
plottri <- plotdf %>%
filter(point_form == "triangular") %>%
mutate(original_point_type = str_remove_all(
original_point_type, "Triangular"
) %>%
str_replace_all("  ", " ") %>% as.character())
mean_df <- plottri %>%
select(-rowid) %>%
group_by(original_point_type, xl, yl) %>%
summarise_at(vars(x, y), mean, na.rm = T)
plottri %>%
ggplot(aes(x, y, fill = original_point_type)) +
geom_point(size = 1, shape = 21, color = "white", alpha = .5) +
geom_point(
data = mean_df, aes(x = x, y = y, fill = original_point_type),
shape = 23, size = 4, alpha = .8, color = "black"
) +
scale_fill_viridis_d() +
theme_bw() +
coord_equal() +
guides(fill = "none") +
facet_wrap(~original_point_type) +
theme(
strip.background = element_rect(fill = "white"),
strip.text = element_text(size = 7.5)
)
nm <- "mean_landmarks_tri"
# ggsave(file.path("figures",paste0(nm,".png")),
# dpi = 900, width = 170, units = "mm")
# file.path("figures",paste0(nm,".png")) %>%
# image_read() %>% image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
# image_write(file.path("figures",paste0(nm,".png")))
# Examine side-notched types
type_side <- type_collection %>%
filter(point_form == "side-notched") %>%
filter(original_point_type != "Cottonwood Triangular")
types <- table(type_side$original_point_type) %>%
as.data.frame() %>%
filter(Freq > 5) %>%
pull(Var1) %>%
as.character()
type_side <- type_side %>%
filter(original_point_type %in% types)
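# Two-dimensional UMAP embedding of the side-notched landmark coordinates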
u <- umap(type_side %>% select(any_of(landmarks)))
u_df <- u$layout %>%
as.data.frame() %>%
cbind(type_side %>%
select(original_point_type)) %>%
as_tibble() %>%
mutate(original_point_type = original_point_type %>% factor()) %>%
rename(`point type` = original_point_type, x = V1, y = V2)
u_df_sub <- u_df %>%
filter(`point type` %in% c("Awatovi", "Bonito", "Buck Taylor"))
# Create a custom legend for both color and shape by combining the aesthetics
ggplot(u_df, aes(y, x, color = `point type`, shape = `point type`)) +
geom_point(size = 3) +
scale_color_viridis_d() +
scale_shape_manual(values = c(15, 17, 18, 19, 19, 19, 19, 19, 19, 19)) +
stat_ellipse(
data = u_df_sub, aes(fill = `point type`),
geom = "polygon", level = 0.68, alpha = .1
) +
theme_bw() +
coord_equal() +
guides(
fill = guide_none(),
color = guide_legend(ncol = 3)
) +
theme(
legend.position = "bottom",
legend.title = element_blank(),
legend.text = element_text(size = 8),
legend.key.size = unit(0.4, "cm"),
legend.margin = ggplot2::margin(t = -20, 0, 0, 0)
) +
xlab("") +
ylab("")
nm <- "umap_side_original"
# ggsave(file.path("figures",paste0(nm,".png")),
# dpi = 900, width = 170, units = "mm")
#
# file.path("figures",paste0(nm,".png")) %>%
# image_read() %>% image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
# image_write(file.path("figures",paste0(nm,".png")))
#
# browseURL(file.path("figures",paste0(nm,".png")))
groups <- levels(u_df$`point type`)
category <- "point type"
results_hot_side_notch_original_types <- run_hotellings(groups, category, u_df)
print(results_hot_side_notch_original_types %>% select(group1, group2, p_value))
tbl1 <- results_hot_side_notch_original_types %>%
filter(p_value > 0.1) %>%
mutate(form = "side-notched")
names(tbl1) <- c(
"Type 1", "Type 2", "Type 1 Total",
"Type 2 Total", "T-squared", "P-value", "Form"
)
# Examine triangular types
type_tri <- type_collection %>%
filter(point_form == "triangular")
types <- table(type_tri$original_point_type) %>%
as.data.frame() %>%
filter(Freq > 5) %>%
pull(Var1) %>%
as.character()
type_tri <- type_tri %>%
filter(original_point_type %in% types)
landmarks <- type_tri %>%
select(x1:y5) %>%
names()
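# Two-dimensional UMAP embedding of the triangular landmark coordinates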
u <- umap(type_tri %>% select(any_of(landmarks)))
u_df <- u$layout %>%
as.data.frame() %>%
cbind(type_tri %>%
select(original_point_type)) %>%
as_tibble() %>%
mutate(original_point_type = original_point_type %>% factor()) %>%
rename(`point type` = original_point_type, x = V1, y = V2)
ggplot(u_df, aes(x, y, color = `point type`, shape = `point type`)) +
geom_point(size = 3) +
scale_shape_manual(values = c(19, 17, 19, 15, 15, 18)) +
stat_ellipse(aes(fill = `point type`),
geom = "polygon",
level = 0.68, alpha = .1
) +
theme_bw() +
scale_color_viridis_d() +
coord_equal() +
theme(
legend.position = "bottom",
legend.title = element_blank(),
legend.margin = ggplot2::margin(-20, 0, 0, 0, unit = "pt")
) +
guides(color = guide_legend(ncol = 3)) +
xlab("") +
ylab("")
nm <- "umap_tri_original"
# ggsave(file.path("figures",paste0(nm,".png")),
# dpi = 900, width = 170, units = "mm")
#
# file.path("figures",paste0(nm,".png")) %>%
# image_read() %>% image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
# image_write(file.path("figures",paste0(nm,".png")))
#
# browseURL(file.path("figures",paste0(nm,".png")))
groups <- levels(u_df$`point type`)
category <- "point type"
results_triangular_original_types <- run_hotellings(groups, category, u_df)
print(results_triangular_original_types %>% select(group1, group2, p_value))
tbl2 <- results_triangular_original_types %>%
filter(p_value > 0.1) %>%
mutate(form = "triangular")
names(tbl2) <- c(
"Type 1", "Type 2",
"Type 1 Total",
"Type 2 Total",
"T-squared",
"P-value",
"Form"
)
# write_csv(bind_rows(tbl1,tbl2),
# "tables/hotelling_test_results_original_types.csv")
#### Cluster Analysis of All Landmarks ####
# landmark shape clusters
points_sub <- points %>%
filter(point_form %in% c("side-notched", "triangular")) %>%
select(object_id, point_form, x1:y5) %>%
filter(!is.na(y5))
points_out <- lapply(1:nrow(points_sub), function(i) {
# Extract the relevant x, y columns for row i
row <- points_sub[i, c(
"x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4",
"x5", "y5"
)]
# Convert to numeric and reshape into an 8x2 matrix (x, y pairs)
matrix(as.numeric(unlist(row)), ncol = 2, byrow = TRUE)
})
# project triangular and side-notched points into the same morphospace
points_out <- Out(points_out) %>%
Ldk() %>%
fgProcrustes()
points_out_df <- as_df(points_out)
u <- umap(points_out_df)
u_df <- u$layout %>%
as.data.frame() %>%
cbind(points_sub %>% select(object_id, point_form)) %>%
as_tibble()
ggplot(u_df, aes(V1, V2, color = point_form)) +
geom_point()
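# Cluster the UMAP embedding: Ward's hierarchical clustering on Minkowski
# distances for the dendrogram, with final assignments taken from k-means (k = 3)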
hc <- hclust(dist(u$layout, method = "minkowski"), method = "ward.D2")
plot(hc)
clusters <- cutree(hc, k = 3)
clusters <- kmeans(u$layout, centers = 3, nstart = 3)$cluster
u_df$cluster <- factor(clusters)
ggplot(u_df, aes(V1, V2, color = cluster)) +
geom_point() +
scale_color_viridis_d() +
coord_equal() +
theme_bw() +
xlab("") +
ylab("") +
theme(
legend.position = "bottom",
legend.margin = ggplot2::margin(-20, 0, 0, 0, unit = "pt")
)
nm <- "umap_landmark_cluster"
# ggsave(file.path("figures",paste0(nm,".png")),
# dpi = 900, width = 150, units = "mm")
#
# file.path("figures",paste0(nm,".png")) %>%
# image_read() %>%
# image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
# image_write(file.path("figures",paste0(nm,".png")))
#### EFA Cluster analysis ####
k <- 3
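# rect_hclust_filled(): a variant of stats::rect.hclust() that draws filled,
# semi-transparent rectangles around the k clusters on a dendrogram plot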
rect_hclust_filled <- function(tree, k = NULL, which = NULL, x = NULL, h = NULL,
border = 2, cluster = NULL, alpha = 1) {
if (length(h) > 1L || length(k) > 1L) {
stop("'k' and 'h' must be a scalar")
}
if (!is.null(h)) {
if (!is.null(k)) {
stop("specify exactly one of 'k' and 'h'")
}
k <- min(which(rev(tree$height) < h))
k <- max(k, 2)
} else if (is.null(k)) {
stop("specify exactly one of 'k' and 'h'")
}
if (k < 2 || k > length(tree$height)) {
stop(gettextf("k must be between 2 and %d", length(tree$height)), domain = NA)
}
if (is.null(cluster)) {
cluster <- cutree(tree, k = k)
}
clustab <- table(cluster)[unique(cluster[tree$order])]
m <- c(0, cumsum(clustab))
if (!is.null(x)) {
if (!is.null(which)) {
stop("specify exactly one of 'which' and 'x'")
}
which <- x
for (n in seq_along(x)) which[n] <- max(which(m < x[n]))
} else if (is.null(which)) {
which <- 1L:k
}
if (any(which > k)) {
stop(gettextf("all elements of 'which' must be between 1 and %d", k), domain = NA)
}
border <- rep_len(border, length(which))
retval <- list()
for (n in seq_along(which)) {
fill_col <- adjustcolor(border[n], alpha.f = alpha)
rect(
m[which[n]] + 0.66,
par("usr")[3L],
m[which[n] + 1] + 0.33,
mean(rev(tree$height)[(k - 1):k]),
border = border[n],
col = fill_col
)
retval[[n]] <- which(cluster == as.integer(names(clustab)[which[n]]))
}
invisible(retval)
}
out <- readRDS(here("data/point_outlines.Rds"))
efa_out <- out %>%
fgProcrustes(tol = 0.1) %>%
Momocs::efourier()
pca_out <- efa_out %>% Momocs::PCA()
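# Embed the elliptical Fourier coefficients in two dimensions with UMAP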
u <- umap(efa_out$coe)
u_df <- u$layout %>%
as.data.frame() %>%
rownames_to_column("object_id") %>%
as_tibble()
hc <- hclust(dist(u$layout, method = "minkowski"), method = "ward.D2")
# fn = file.path("figures","hclust.png")
# png(fn, width = 200, height = 100, res = 900, units = "mm")
# plot(
# hc,
# labels = FALSE,
# hang = -1,
# xlab = "",
# ylab = "",
# main = "",
# sub = "",
# yaxt = "n",
# cex.main = 1.5,
# cex.axis = 1.2
# )
# rect_hclust_filled(hc, k = k, border = c("#1b9e77", "#d95f02", "#7570b3"), alpha = .2)
# dev.off()
# image_read(fn) %>%
# image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
# image_write(fn)
# browseURL(fn)
clusters <- cutree(hc, k = k)
u_df$cluster <- factor(clusters)
out$fac$cluster <- factor(clusters)
# mean shapes of clusters
out_split <- chop(out, fac = "cluster")
# Compute mean shapes for each category
mean_shapes_list <- lapply(out_split, function(coo_object) {
MSHAPES(coo_object$coo)
})
mean_shapes_list <- Momocs::combine(mean_shapes_list %>% Out())
mean_shapes_list$fac <- tibble(cluster = names(mean_shapes_list))
a <- coo_length(mean_shapes_list) / min(coo_length(mean_shapes_list))
mean_shapes_list <- coo_scale(mean_shapes_list, a)
stack(mean_shapes_list)
mean_shapes_df <- as_df(mean_shapes_list) %>%
mutate(coo = map(coo, as_tibble)) %>%
unnest(coo) %>%
group_split(cluster)
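# getbbox(): trimmed centroid of a point cloud (mean of x and y within the
# central quantile range), used to position the mean-shape insets on the plot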
getbbox <- function(x, y, q = .75) {
q1 <- quantile(x, 1 - q)
q2 <- quantile(x, q)
q3 <- quantile(y, 1 - q)
q4 <- quantile(y, q)
xt <- x[x > q1 & x < q2]
yt <- y[y > q3 & y < q4]
return(c(mean(xt), mean(yt)))
}
u_df_group <- u_df %>%
group_split(cluster)
coords <- lapply(1:length(u_df_group), function(i) {
tmp <- u_df_group[[i]]
getbbox(tmp$V1, tmp$V2)
})
p <- list()
for (i in 1:length(mean_shapes_df)) {
tmp <- mean_shapes_df[[i]] %>%
ggplot(aes(x = V1, y = V2, group = cluster)) +
geom_polygon(fill = "#404040") +
theme_void() +
coord_equal()
tmp <- ggplotGrob(tmp)
p[[i]] <- tmp
}
library(grid)
library(cowplot)
u_df %>%
rename(x = V1, y = V2) %>%
ggplot(aes(x = y, y = x, color = cluster)) +
annotation_custom(
grob = p[[1]],
ymin = coords[[1]][1] - .75,
ymax = coords[[1]][1] + .75,
xmin = coords[[1]][2] - 1.5,
xmax = coords[[1]][2] + 1.5
) +
annotation_custom(
grob = p[[2]],
ymin = coords[[2]][1] - .75,
ymax = coords[[2]][1] + .75,
xmin = coords[[2]][2] - 1.5,
xmax = coords[[2]][2] + 1.5
) +
annotation_custom(
grob = p[[3]],
ymin = coords[[3]][1] - .75,
ymax = coords[[3]][1] + .75,
xmin = coords[[3]][2] - 1.5,
xmax = coords[[3]][2] + 1.5
) +
geom_point(size = .75, alpha = 0.66) +
stat_ellipse(aes(fill = cluster), geom = "polygon", level = 0.68, alpha = .1) +
theme_bw() +
scale_color_viridis_d() +
coord_equal() +
# remove x and y labels
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.ticks = element_blank(),
legend.margin = ggplot2::margin(-10, 0, 0, 0, unit = "pt"),
) +
theme(legend.position = "bottom")
nm <- "cluster_shapes_all"
# ggsave(file.path("figures",paste0(nm,".png")),
# dpi = 900, width = 170, height = 100, units = "mm")
# file.path("figures",paste0(nm,".png")) %>%
# image_read() %>% image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
# image_write(file.path("figures",paste0(nm,".png")))
# browseURL(file.path("figures",paste0(nm,".png")))
# check cluster assignments
tst <- manova(cbind(u_df$V1, u_df$V2) ~ cluster, data = u_df)
summary(tst, test = "Wilks")
run_hotellings(levels(u_df$cluster), "cluster", u_df[, c("cluster", "V1", "V2")])
points_update <- points %>%
select(`H-ID` = projectile_point_h_id, object_id) %>%
inner_join(u_df %>% select(object_id, cluster)) %>%
rowwise() %>%
mutate(cluster = paste0("shape-", cluster))
#### examine base shapes using landmarks ####
# calculate the angle between the base points y3 and y5
points <- points %>%
rowwise() %>%
mutate(base_slope = atan((y5 - y3) / (x5 - x3)) * 180 / pi)
points %>%
pull(base_slope) %>%
summary()
points %>%
pull(base_slope) %>%
hist()
points %>%
filter(!is.na(base_slope)) %>%
filter(!base_type %in% c("", "unknown")) %>%
mutate(base_type = factor(base_type,
levels = c("deeply concave", "concave", "straight", "convex")
)) %>%
ggplot(aes(x = base_slope, fill = base_type)) +
geom_boxplot()
points %>%
filter(!is.na(base_slope)) %>%
filter(!base_type %in% c("", "unknown")) %>%
group_by(base_type) %>%
summarize(
mean_slope = mean(base_slope, na.rm = T),
quantile1 = quantile(base_slope, .25, na.rm = T),
quantile3 = quantile(base_slope, .75, na.rm = T)
)
points <- points %>%
filter(basal_notch == "") %>%
mutate(base_type_updated = case_when(
base_slope < -18 ~ "deeply concave",
base_slope < -5 ~ "concave",
base_slope > 5 ~ "convex",
TRUE ~ "straight"
))
points %>%
filter(!is.na(base_slope)) %>%
filter(!base_type %in% c("", "unknown")) %>%
mutate(base_type_updated = factor(base_type_updated,
levels = c("deeply concave", "concave", "straight", "convex")
)) %>%
ggplot(aes(x = base_slope, fill = base_type_updated)) +
geom_boxplot()
diff <- points %>%
filter(!is.na(base_slope)) %>%
filter(!base_type %in% c("", "unknown")) %>%
filter(base_type != base_type_updated) %>%
select(object_id, base_type, base_type_updated, base_slope, basal_notch)
diff <- diff %>%
ungroup() %>%
slice(175:728) %>%
filter(base_type_updated == "deeply concave")
# write_tsv(diff, "C://users/rjbischo/Downloads/base_type_diff.tsv", na = "")
#### Notch Heights ####
points_side <- points %>%
filter(point_form == "side-notched") %>%
select(object_id, point_type, x1:y8) %>%
filter(!is.na(x6)) %>%
rowwise() %>%
mutate(
notch_height_bottom =
dist(matrix(c(x3, y3, x8, y8), ncol = 2, byrow = T)) %>% as.numeric(),
length = dist(matrix(c(x1, y1, x3, y3), ncol = 2, byrow = T)) %>%
as.numeric(),
notch_ratio = length / notch_height_bottom
) %>%
ungroup()
fn <- file.path("figures", "notch_ratio.png")
# png(fn, width = 150, height = 100, res = 900, units = "mm")
hist(points_side$notch_ratio, breaks = 100, xlab = "", main = "")
low <- quantile(points_side$notch_ratio, .025)
high <- quantile(points_side$notch_ratio, .67)
abline(v = low, col = "red", lwd = 2, lty = 2)
abline(v = high, col = "red", lwd = 2, lty = 2)
# dev.off()
# image_read(fn) %>% image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
# image_write(fn)
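# Classify notch height from the length-to-notch ratio: below the 2.5th
# percentile = "high", above the 67th percentile = "low", otherwise "mid"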
points_side <- points_side %>%
mutate(
notch_height = case_when(
notch_ratio < low ~ "high",
notch_ratio > high ~ "low",
TRUE ~ "mid"
),
point_type = paste0(point_type, "-", notch_height)
) %>%
mutate(point_type = case_when(
str_detect(point_type, "-high") ~ "side-notched-high", TRUE ~ point_type
))
table(points_side$notch_height)
#### Assigning Point Types ####
points <- points %>%
rowwise() %>%
mutate(
b = ifelse(basal_notch == "Yes", "basal_notch", ""),
s = ifelse(point_form == "side-notched", "", serrated),
s = ifelse(point_form == "triangular" & s == "Yes", "serrated", ""),
point_type_old = point_type
) %>%
mutate(point_type = paste(
c(point_cluster, base_type, point_form, b, notch_height, s),
collapse = "_"
)) %>%
ungroup() %>%
# select(-b,-s) %>%
mutate(
point_type = str_remove_all(point_type, "_NA"),
point_type = str_remove_all(point_type, "^_|_$"),
point_type = str_remove_all(point_type, "^_|_$"),
point_type = str_replace_all(point_type, "__", "_"),
point_type = str_remove_all(point_type, "^_|_$")
) %>%
mutate(
point_type = ifelse(!point_form %in% c("side-notched", "triangular"),
"undetermined", point_type
),
point_type = ifelse(point_cluster == "", "undetermined", point_type),
point_type = ifelse(str_detect(point_type, "undetermined|unknown|archaic"),
"undetermined", point_type
)
)
table(points$point_type)
length(points$point_type %>% unique())
points %>%
filter(point_type_old != point_type) %>%
select(`H-ID` = projectile_point_h_id, object_id, point_type) %>%
write_tsv(file.path("data/point_types.tsv"), na = "")
valid_types <- points %>%
filter(point_type != "undetermined") %>%
group_by(point_form, point_type) %>%
count()
table(valid_types$point_form)
valid_types %>%
filter(n == 1) %>%
pull(point_type) %>%
paste(collapse = "\n") %>%
cat()
valid_types %>%
ungroup() %>%
filter(point_form == "side-notched") %>%
pull(n) %>%
summary()
valid_types %>%
ungroup() %>%
filter(point_form == "triangular") %>%
pull(n) %>%
summary()
tmp <- points %>%
filter(
original_point_type %in% c(
"Awatovi",
"Bonito",
"Citrus Side-notched",
"Cohonino Triangular",
"Cottonwood Triangular",
"Desert Side Notched",
"Snaketown Triangular Concave Base",
"Snaketown Triangular Straight-base",
"Sobaipuri",
"Temporal"
)
)
m <- table(tmp$point_type, tmp$original_point_type) %>% as.matrix()
m[m < 3] <- 0
m <- m[rowSums(m != 0) > 0, ]
m <- m %>% t()
m <- m[rowSums(m != 0) > 0, ]
fn <- "figures/original_point_type_matrix.png"
# png(fn, width = 6, height = 6, res = 900, units = "in")
# pheatmap::pheatmap(m, display_numbers = T,
# number_format = "%.0f", legend = F, angle_col = 315)
# dev.off()
# # trim white space
# image_read(fn) %>% image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
# image_write(fn)
#### Hotellings tests to discriminate between types ####
points_sub <- points %>%
filter(point_form == "side-notched") %>%
filter(point_type != "undetermined") %>%
select(object_id, point_type, x1:y8) %>%
filter(!is.na(y8))
landmarks <- points_sub %>% # nolint
dplyr::select(x1:y8) %>%
names()
groups <- points_sub$point_type %>%
unique() %>%
sort()
u <- umap(points_sub %>% select(any_of(landmarks)))
udf <- u$layout %>%
as.data.frame() %>%
cbind(points_sub %>%
select(object_id, point_type)) %>%
as_tibble() %>%
mutate(point_type = point_type %>% factor()) %>%
rename(x = V1, y = V2)
results_hot_side <- run_hotellings(groups, "point_type", udf %>%
select(-object_id))
results_hot_side
results_hot_side %>%
filter(p_value < .1) %>%
nrow() / nrow(results_hot_side)
points_sub <- points %>%
filter(point_form == "triangular") %>%
filter(point_type != "undetermined") %>%
select(object_id, point_type, x1:y5) %>%
filter(!is.na(y5))
landmarks <- points_sub %>%
select(x1:y5) %>%
names()
groups <- points_sub$point_type %>%
unique() %>%
sort()
u <- umap(points_sub %>% select(any_of(landmarks)))
udf <- u$layout %>%
as.data.frame() %>%
cbind(points_sub %>%
select(object_id, point_type)) %>%
as_tibble() %>%
mutate(point_type = point_type %>% factor()) %>%
rename(x = V1, y = V2)
results_hot_tri <- run_hotellings(groups, "point_type", udf %>%
select(-object_id))
results_hot_tri
results_hot_tri %>%
filter(p_value < .1) %>%
nrow() / nrow(results_hot_tri)
points_sub <- points %>%
select(object_id, point_cluster, x1:y5) %>%
filter(!is.na(y5)) %>%
filter(!point_cluster %in% c(""))
landmarks <- points_sub %>%
select(x1:y5) %>%
names()
groups <- points_sub$point_cluster %>%
unique() %>%
sort()
u <- umap(points_sub %>% select(any_of(landmarks)))
udf <- u$layout %>%
as.data.frame() %>%
cbind(points_sub %>%
select(object_id, point_cluster)) %>%
as_tibble() %>%
mutate(point_cluster = point_cluster %>% factor()) %>%
rename(x = V1, y = V2)
results_hot_shape <- run_hotellings(groups, "point_cluster", udf %>%
select(-object_id))
results_hot_shape
points_sub <- points %>%
filter(notch_height != "") %>%
select(object_id, notch_height, x1:y8) %>%
filter(!is.na(y8))
landmarks <- points_sub %>%
select(x1:y8) %>%
names()
groups <- points_sub$notch_height %>%
unique() %>%
sort()
u <- umap(points_sub %>% select(any_of(landmarks)))
udf <- u$layout %>%
as.data.frame() %>%
cbind(points_sub %>%
select(object_id, notch_height)) %>%
as_tibble() %>%
mutate(notch_height = notch_height %>% factor()) %>%
rename(x = V1, y = V2)
results_hot_notch <- run_hotellings(groups, "notch_height", udf %>%
select(-object_id))
results_hot_notch
# easily distinguishes between notch heights
points_sub <- points %>%
select(object_id, base_type, x1:y5) %>%
filter(!is.na(y5)) %>%
filter(!base_type %in% c("", "unknown"))
landmarks <- points_sub %>%
select(x1:y5) %>%
names()
groups <- points_sub$base_type %>%
unique() %>%
sort()
u <- umap(points_sub %>% select(any_of(landmarks)))
udf <- u$layout %>%
as.data.frame() %>%
cbind(points_sub %>%
select(object_id, base_type)) %>%
as_tibble() %>%
mutate(base_type = base_type %>% factor()) %>%
rename(x = V1, y = V2)
results_hot_base <- run_hotellings(groups, "base_type", udf %>%
select(-object_id))
results_hot_base
table <- bind_rows(
results_hot_side,
results_hot_tri, results_hot_shape,
results_hot_notch, results_hot_base
) %>%
rename(
`Group 1` = group1, `Group 2` = group2,
`Group 1 Total` = n_group1, `Group 2 Total` = n_group2,
`T-squared` = T_squared, `P-value` = p_value
) %>%
distinct_all() %>%
filter(`P-value` > .1) %>%
filter(`Group 1 Total` > 9) %>%
filter(`Group 2 Total` > 9)
table
# write_csv(table,"tables/hotelling_test_results_all.csv")
table %>%
separate(`Group 1`, c("a1", "b1", "c1", "d1", "e1"), sep = "_", remove = F) %>%
separate(`Group 2`, c("a2", "b2", "c2", "d2", "e2"), sep = "_", remove = F) %>%
mutate(rowid = 1:n()) %>%
select(
all_of(c("rowid", "Group 1", "Group 2")),
a1, b1, c1, d1, e1, a2, b2, c2, d2, e2
) %>%
mutate_all(replace_na, "") %>%
mutate(matches = rowSums(
across(c(a1, b1, c1, d1, e1)) == across(c(a2, b2, c2, d2, e2))
))
#### compute mean shapes ####
types1 <- table(points %>% filter(point_type != "undetermined") %>%
filter(point_form == "side-notched") %>%
pull(point_type)) %>%
as.data.frame() %>%
arrange(desc(Freq)) %>%
head(9) %>%
pull(Var1)
types2 <- table(points %>% filter(point_type != "undetermined") %>%
filter(point_form == "triangular") %>%
pull(point_type)) %>%
as.data.frame() %>%
arrange(desc(Freq)) %>%
head(9) %>%
pull(Var1)
types <- c(types1, types2)
points_sub <- points %>%
filter(point_type != "undetermined")
ntotal <- points_sub %>% nrow()
nsub <- points_sub %>%
filter(point_type %in% types) %>%
nrow()
nsub / ntotal
out <- readRDS(here("data/point_outlines.Rds"))
out$fac <- out$fac %>%
select(object_id) %>%
left_join(points) %>%
distinct(object_id, .keep_all = T)
out_sub <- out %>%
filter(point_type %in% types) %>%
mutate(point_type = factor(point_type))
x <- coo_length(out_sub)
f <- x / max(x)
out_sub <- coo_scale(out_sub, f)
# Split the object by 'point_type' using chop
out_split <- chop(out_sub, fac = "point_type")
# Compute mean shapes for each category
mean_shapes_list <- lapply(out_split, function(coo_object) {
MSHAPES(coo_object$coo)
})
# Combine the mean shapes into a new 'Out' object
mean_shapes <- Momocs::combine(mean_shapes_list %>% Out())
# Check the resulting object
mean_shapes$fac <- tibble(type = names(mean_shapes) %>% factor()) %>%
mutate(form = ifelse(str_detect(type, "side-notched"), "SN", "Tri"))
mean_shapes_df <- mean_shapes %>%
as_df() %>%
mutate(coo = map(coo, as_tibble)) %>%
unnest(coo)
mean_shapes_df <- mean_shapes_df %>%
group_by(type) %>%
mutate(row_number = row_number()) %>%
filter(row_number == 1) %>%
bind_rows(mean_shapes_df, .) %>%
arrange(type, row_number) %>%
select(-row_number)
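# generate_letter_combinations(): panel labels "a" through "z", extended with
# "aa", "ab", ... when more than 26 labels are needed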
generate_letter_combinations <- function(n) {
if (n <= 26) {
return(letters[1:n])
} else {
# Single letters (a-z)
single_letters <- letters
# Double letters (aa, ab, ac, ..., zz) in the correct order
double_letters <- expand_grid(first = letters, second = letters) %>%
mutate(combination = paste0(first, second)) %>%
pull(combination)
# Combine single and double letters and take the first 'n'
return(c(single_letters, double_letters)[1:n])
}
}
unique_types <- mean_shapes_df %>%
filter(form == "SN") %>%
pull(type) %>%
unique() %>%
sort()
letters_combined1 <- generate_letter_combinations(length(unique_types))
# Create the tibble
dict1 <- tibble(form = "SN", type = unique_types, letter1 = letters_combined1)
unique_types <- mean_shapes_df %>%
filter(form == "Tri") %>%
pull(type) %>%
unique() %>%
sort()
letters_combined2 <- generate_letter_combinations(length(unique_types))
# Create the tibble
dict2 <- tibble(form = "Tri", type = unique_types, letter2 = letters_combined2)
mean_shapes_df <- mean_shapes_df %>%
left_join(dict1) %>%
left_join(dict2) %>%
mutate(letter = ifelse(is.na(letter1), letter2, letter1)) %>%
mutate(letter = factor(letter,
levels =
unique(sort(c(letters_combined1, letters_combined2)))
))
hex_colors <- hcl.colors(length(letters_combined1), palette = "Spectral")
hcl_colors <- coords(as(hex2RGB(hex_colors), "polarLUV"))
hcl_colors[, "C"] <- hcl_colors[, "C"] * 1.25
saturated_colors <- hcl(hcl_colors[, "H"], hcl_colors[, "C"], hcl_colors[, "L"])
df_grps <- mean_shapes_df %>%
group_split(form)
g1 <- ggplot(data = df_grps[[1]]) +
geom_polygon(aes(x = x, y = y, group = type, fill = letter),
alpha = .85, linewidth = 1, color = "black"
) +
# scale_fill_viridis_d() +
scale_fill_manual(values = saturated_colors) +
theme_void() +
coord_fixed(clip = "off") +
guides(fill = "none") +
theme(
strip.text = element_text(size = 12, face = "bold"),
strip.background = element_rect(fill = "white")
) +
facet_wrap(~letter, ncol = 3) +
ggtitle("Side-Notched Points")
g2 <- ggplot(data = df_grps[[2]]) +
geom_polygon(aes(x = x, y = y, group = type, fill = letter),
alpha = .85, linewidth = 1, color = "black"
) +
# scale_fill_viridis_d() +
scale_fill_manual(values = saturated_colors) +
theme_void() +
coord_fixed(clip = "off") +
guides(fill = "none") +
theme(
strip.text = element_text(size = 12, face = "bold"),
strip.background = element_rect(fill = "white")
) +
facet_wrap(~letter, ncol = 3) +
ggtitle("Triangular Points")
g <- cowplot::plot_grid(g1, g2, ncol = 2)
g
nm <- "mean_types"
# ggsave(file.path("figures",paste0(nm,".png")), dpi = 900,
# width = 170, units = "mm")
# file.path("figures",paste0(nm,".png")) %>% image_read() %>%
# image_trim() %>% image_border(geometry = "50x50", color = "white") %>%
# image_write(file.path("figures",paste0(nm,".png")))
dict <- df_grps[[1]] %>%
distinct(form, type, letter) %>%
arrange(form)
paste(
"For side-notched types:",
map2_chr(dict$type, dict$letter, function(x, y) paste0(x, " (", y, ")")) %>%
paste(collapse = ", ")
) %>% cat()
dict <- df_grps[[2]] %>%
distinct(form, type, letter) %>%
arrange(form)
paste(
"For triangular types:",
map2_chr(dict$type, dict$letter, function(x, y) paste0(x, " (", y, ")")) %>%
paste(collapse = ", ")
) %>% cat()
#### Stack Points ####
types1 <- table(points %>% filter(point_type != "undetermined") %>%
filter(point_form == "side-notched") %>%
pull(point_type)) %>%
as.data.frame() %>%
arrange(desc(Freq)) %>%
head(9) %>%
pull(Var1)
types2 <- table(points %>% filter(point_type != "undetermined") %>%
filter(point_form == "triangular") %>%
pull(point_type)) %>%
as.data.frame() %>%
arrange(desc(Freq)) %>%
head(9) %>%
pull(Var1)
types <- c(types1, types2)
points_sub <- points %>%
filter(point_type != "undetermined")
ntotal <- points_sub %>% nrow()
nsub <- points_sub %>%
filter(point_type %in% types) %>%
nrow()
nsub / ntotal
out <- readRDS(here("data/point_outlines.Rds"))
# x = out_sub %>%
# filter(object_id == "NA680.R3.9-1")
# out = out %>%
# filter(object_id != "NA3673T.174-1")
# stack(x)
out$fac <- out$fac %>%
select(object_id) %>%
left_join(points) %>%
distinct(object_id, .keep_all = T)
out_sub <- out %>%
filter(point_type %in% types) %>%
mutate(point_type = factor(point_type))
out_aligned <- out_sub %>% fgProcrustes()
x <- coo_length(out_sub)
f <- x / max(x)
out_sub <- coo_scale(out_sub, f)
out_df <- out_aligned %>%
as_df() %>%
mutate(coo = map(coo, as_tibble)) %>%
unnest(coo)
out_df <- out_df %>%
group_by(point_type) %>%
rowid_to_column("row_number") %>%
filter(row_number == 1) %>%
bind_rows(out_df, .) %>%
arrange(point_type, row_number) %>%
mutate(row_number = row_number())
generate_letter_combinations <- function(n) {
if (n <= 26) {
return(letters[1:n])
} else {
# Single letters (a-z)
single_letters <- letters
# Double letters (aa, ab, ac, ..., zz) in the correct order
double_letters <- expand_grid(first = letters, second = letters) %>%
mutate(combination = paste0(first, second)) %>%
pull(combination)
# Combine single and double letters and take the first 'n'
return(c(single_letters, double_letters)[1:n])
}
}
unique_types <- out_df %>%
filter(point_form == "side-notched") %>%
pull(point_type) %>%
unique() %>%
sort()
letters_combined1 <- generate_letter_combinations(length(unique_types))
# Create the tibble
dict1 <- tibble(
point_form = "side-notched", point_type = unique_types,
letter1 = letters_combined1
)
unique_types <- out_df %>%
filter(point_form == "triangular") %>%
pull(point_type) %>%
unique() %>%
sort()
letters_combined2 <- generate_letter_combinations(length(unique_types))
# Create the tibble
dict2 <- tibble(
point_form = "triangular", point_type = unique_types,
letter2 = letters_combined2
)
out_df <- out_df %>%
left_join(dict1) %>%
left_join(dict2) %>%
mutate(letter = ifelse(is.na(letter1), letter2, letter1)) %>%
mutate(letter = factor(letter,
levels =
unique(sort(c(letters_combined1, letters_combined2)))
))
hex_colors <- hcl.colors(length(letters_combined1), palette = "Spectral")
hcl_colors <- coords(as(hex2RGB(hex_colors), "polarLUV"))
hcl_colors[, "C"] <- hcl_colors[, "C"] * 1.25
saturated_colors <- hcl(hcl_colors[, "H"], hcl_colors[, "C"], hcl_colors[, "L"])
df_grps <- out_df %>%
rename(x = V1, y = V2) %>%
arrange(point_type, row_number) %>%
group_split(point_form)
g1 <- ggplot(data = df_grps[[1]]) +
geom_path(aes(x = x, y = y, group = object_id, color = letter),
alpha = .85
) +
scale_color_manual(values = saturated_colors) +
theme_void() +
coord_fixed(clip = "off") +
guides(color = "none") +
theme(
strip.text = element_text(size = 12, face = "bold"),
plot.background = element_rect(
fill = "#e8eaed", color = "black", linewidth = 1.5
),
plot.title = element_text(hjust = .1)
) +
facet_wrap(~letter, ncol = 3) +
ggtitle("Side-Notched Points")
g2 <- ggplot(data = df_grps[[2]]) +
geom_path(aes(x = x, y = y, group = object_id, color = letter),
alpha = .85
) +
scale_color_manual(values = saturated_colors) +
theme_void() +
coord_fixed(clip = "off") +
guides(color = "none") +
theme(
strip.text = element_text(size = 12, face = "bold"),
plot.background = element_rect(
fill = "#e8eaed", color = "black", linewidth = 1.5
),
plot.title = element_text(hjust = .1)
) +
facet_wrap(~letter, ncol = 3) +
ggtitle("Triangular Points")
g <- cowplot::plot_grid(g1, g2, ncol = 2, align = "hv")
g
nm <- "types_stacked"
# ggsave(file.path("figures",paste0(nm,".png")),
# dpi = 900, width = 170, units = "mm")
#
# file.path("figures",paste0(nm,".png")) %>%
# image_read() %>%
# image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
# image_write(file.path("figures",paste0(nm,".png")))
dict <- df_grps[[1]] %>%
distinct(point_form, point_type, letter) %>%
arrange(point_form)
paste(
"For side-notched types:",
map2_chr(dict$point_type, dict$letter, function(x, y) paste0(x, " (", y, ")")) %>%
paste(collapse = ", ")
) %>% cat()
dict <- df_grps[[2]] %>%
distinct(point_form, point_type, letter) %>%
arrange(point_form)
paste(
"For triangular types:",
map2_chr(dict$point_type, dict$letter, function(x, y) paste0(x, " (", y, ")")) %>%
paste(collapse = ", ")
) %>% cat()
#### Point Distributions ####
points <- rio::import(file.path("data/heurist_projectilePoints.csv")) %>%
janitor::clean_names() %>%
as_tibble()
points <- points %>%
filter(!point_type %in% c("undetermined"))
table(points$point_type) %>%
t() %>%
t()
# create hexagonal grids
site_data <- rio::import(file.path("data/heurist_sites.csv")) %>%
janitor::clean_names() %>%
as_tibble()
states <- rnaturalearth::ne_states() %>%
filter(name %in% c("Arizona", "New Mexico")) %>%
st_transform(26912)
sites <- site_data %>%
select(site_name, long, lat, start_date, end_date) %>%
filter(site_name != "") %>%
distinct_all() %>%
mutate_at(vars(long, lat), as.numeric) %>%
filter(!is.na(long)) %>%
st_as_sf(coords = c("long", "lat"), crs = 4326, remove = F) %>%
st_transform(26912)
point_freq <- points %>%
filter(site_name != "") %>%
select(site_name, point_type) %>%
group_by(site_name, point_type) %>%
count(name = "count") %>%
group_by(site_name) %>%
mutate(total = sum(count)) %>%
mutate(freq = count / total) %>%
ungroup() %>%
filter(total > 4)
point_freq <- inner_join(sites %>% select(site_name), point_freq)
sites <- sites %>%
filter(site_name %in% point_freq$site_name)
grid <- create_grid_hexagonal(sites, cell_size = 36000)
types <- point_freq$point_type %>%
unique() %>%
sort()
p <- list()
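# For each point type with at least 10 points: aggregate site frequencies into
# the hexagonal grid, compute a kernel density surface over the grid, and save a map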
for (type in types) {
print(type)
plotdf <- point_freq %>%
filter(point_type == type)
total_type <- sum(plotdf$count)
if (total_type < 10) next # skip if less than 10 points
grid <- create_grid_hexagonal(sites, cell_size = 36000)
plotdf <- plotdf %>%
select(-site_name) %>%
st_join(grid %>% mutate(id = 1:n())) %>%
group_by(id) %>%
summarize(
count = sum(count),
total = sum(total), freq = count / total
) %>%
st_centroid()
r <- kde(plotdf %>% select(freq), 72000, grid = grid)
p[[type]] <- ggplot() +
geom_sf(data = states, fill = "white") +
# geom_sf(data = grid_data, aes(fill = freq)) +
geom_sf(data = r, aes(fill = kde_value)) +
geom_sf(
data = sites,
color = "white", alpha = 0.75, shape = 17, size = 1
) +
scale_fill_viridis_c() +
theme_minimal() +
theme(plot.margin = unit(c(2, 2, 2, 2), "mm")) + # Set small plot margins
labs(title = type, subtitle = paste(total_type, "total points")) +
coord_sf(xlim = c(150000, 850000), ylim = c(3600000, 4124890))
ggsave(file.path(paste0(
"figures/points_kde/",
type, "-kde_hex_binned.png"
)), width = 8, height = 4.5, dpi = 600)
}
point_freq <- points %>%
filter(site_name != "") %>%
filter(point_form %in% c("side-notched", "triangular")) %>%
select(site_name, point_form) %>%
group_by(site_name, point_form) %>%
count(name = "count") %>%
group_by(site_name) %>%
mutate(total = sum(count)) %>%
mutate(freq = count / total) %>%
ungroup() %>%
filter(total > 4)
point_freq <- inner_join(sites %>% select(site_name), point_freq)
sites <- sites %>%
filter(site_name %in% point_freq$site_name)
grid <- create_grid_hexagonal(sites, cell_size = 36000)
types <- point_freq$point_form %>%
unique() %>%
sort()
p <- list()
for (type in types) {
print(type)
plotdf <- point_freq %>%
filter(point_form == type)
total_type <- sum(plotdf$count)
if (total_type < 10) next # skip if less than 10 points
grid <- create_grid_hexagonal(sites, cell_size = 36000)
plotdf <- plotdf %>%
select(-site_name) %>%
st_join(grid %>% mutate(id = 1:n())) %>%
group_by(id) %>%
summarize(
count = sum(count),
total = sum(total), freq = count / total
) %>%
st_centroid()
r <- kde(plotdf %>% select(freq), 72000, grid = grid)
p[[type]] <- ggplot() +
geom_sf(data = states, fill = "white") +
geom_sf(data = r, aes(fill = kde_value)) +
geom_sf(
data = sites,
color = "white", alpha = 0.75, shape = 17, size = 1
) +
scale_fill_viridis_c() +
theme_minimal() +
theme(plot.margin = unit(c(2, 2, 2, 2), "mm")) + # Set small plot margins
labs(title = type, subtitle = paste(total_type, "total points")) +
coord_sf(xlim = c(150000, 850000), ylim = c(3600000, 4124890))
ggsave(file.path(paste0(
"figures/points_kde/",
type, "-kde_hex_binned.png"
)), width = 8, height = 4.5, dpi = 600)
}
point_freq <- points %>%
filter(site_name != "") %>%
filter(point_form %in% c("side-notched", "triangular")) %>%
select(site_name, point_cluster) %>%
group_by(site_name, point_cluster) %>%
count(name = "count") %>%
group_by(site_name) %>%
mutate(total = sum(count)) %>%
mutate(freq = count / total) %>%
ungroup() %>%
filter(total > 4)
point_freq <- inner_join(sites %>% select(site_name), point_freq)
sites <- sites %>%
filter(site_name %in% point_freq$site_name)
grid <- create_grid_hexagonal(sites, cell_size = 36000)
types <- point_freq$point_cluster %>%
unique() %>%
sort()
p <- list()
for (type in types) {
print(type)
plotdf <- point_freq %>%
filter(point_cluster == type)
total_type <- sum(plotdf$count)
if (total_type < 10) next # skip if less than 10 points
grid <- create_grid_hexagonal(sites, cell_size = 36000)
plotdf <- plotdf %>%
select(-site_name) %>%
st_join(grid %>% mutate(id = 1:n())) %>%
group_by(id) %>%
summarize(
count = sum(count),
total = sum(total), freq = count / total
) %>%
st_centroid()
r <- kde(plotdf %>% select(freq), 72000, grid = grid)
p[[type]] <- ggplot() +
geom_sf(data = states, fill = "white") +
geom_sf(data = r, aes(fill = kde_value)) +
geom_sf(data = sites, color = "white", alpha = 0.75, shape = 17, size = 1) +
scale_fill_viridis_c() +
theme_minimal() +
guides(fill = "none") +
theme(plot.margin = unit(c(2, 2, 2, 2), "mm")) + # Set small plot margins
labs(title = type, subtitle = paste(total_type, "total points")) +
coord_sf(xlim = c(150000, 850000), ylim = c(3600000, 4124890))
ggsave(
file.path(
paste0("figures/points_kde/", type, "-kde_hex_binned_nolegend.png")
),
width = 8, height = 4.5, dpi = 600, bg = "white"
)
}
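# combine_imgs(): trim and pad each image, arrange them into an nrow-row grid
# with magick::image_append(), and write the combined figure to file_name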
combine_imgs <- function(imgs, nrow, file_name) {
imgs <- lapply(imgs, function(x) {
image_trim(x) %>%
image_border(geometry = "50x50", color = "white") %>%
image_scale("1500x")
})
if (length(imgs) %% 2 != 0) {
print("adding extra image to make even number of images")
info <- image_info(imgs[[1]])
width <- info$width
height <- info$height
imgs <- c(imgs, image_blank(
width = width, height = height, color = "white"
))
}
ncol <- length(imgs) / nrow
sq <- seq(1, length(imgs), ncol)
rows <- list()
for (i in 1:nrow) {
indx <- sq[i]:(sq[i] + ncol - 1)
print(indx)
rows[[i]] <- image_append(do.call(c, imgs[indx]))
}
final_image <- image_append(do.call(c, rows), stack = TRUE)
final_image <- image_scale(final_image, "4500x")
image_write(final_image, path = file_name, density = 900)
print(paste("saved to", file_name))
}
imgs <- lapply(types, function(x) {
image_read(
file.path(paste0(
"figures/points_kde/",
x, "-kde_hex_binned_nolegend.png"
))
)
})
# combine_imgs(imgs, nrow = 2,
# file_name = file.path("figures/combined_hexbins.png"))
# combine top 4 types for side-notch and triangular
types_tri <- points %>%
filter(point_form == "triangular") %>%
select(point_type) %>%
group_by(point_type) %>%
count() %>%
ungroup() %>%
slice_max(n = 6, order_by = n) %>%
pull(point_type)
types_side <- points %>%
filter(point_form == "side-notched") %>%
select(point_type) %>%
group_by(point_type) %>%
count() %>%
ungroup() %>%
slice_max(n = 6, order_by = n) %>%
pull(point_type)
# imgs_tri = lapply(types_tri,function(x)
# image_read(file.path(paste0(
# "figures/points_kde/",x,"-kde_hex_binned.png"
# ))))
# combine_imgs(imgs = imgs_tri, nrow = 3,
# file_name = file.path(
# "figures/combined_hexbins_triangular.png"
# ))
# imgs_side = lapply(types_side,function(x)
# image_read(file.path(paste0(
# "figures/points_kde/",x,"-kde_hex_binned.png"
# ))))
# combine_imgs(imgs = imgs_side, nrow = 3,
# file_name = file.path(
# "figures/combined_hexbins_side.png"
# ))
#### Community Detection ####
base <- readRDS(file.path("data/backgroundmap.Rds"))
base_tonto <- readRDS(file.path("data/backgroundmap_tonto.Rds"))
site_data <- rio::import(file.path("data/heurist_sites.csv")) %>%
janitor::clean_names() %>%
as_tibble()
states <- rnaturalearth::ne_states() %>%
filter(name %in% c("Arizona", "New Mexico")) %>%
st_transform(26912)
sites <- site_data %>%
select(site_name, long, lat, start_date, end_date) %>%
filter(site_name != "") %>%
distinct_all() %>%
mutate_at(vars(long, lat), as.numeric) %>%
filter(!is.na(long)) %>%
st_as_sf(coords = c("long", "lat"), crs = 4326, remove = F) %>%
st_transform(26912)
points_sub <- points %>%
left_join(sites %>% select(site_name, start_date, end_date)) %>%
filter(!point_type %in% c("undetermined")) %>%
filter(!site_name %in% c("", "Wupatki National Monument"))
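# Site-by-type percentage matrix; Manhattan distances between sites are rescaled
# to a 0-1 similarity (1 - d / 200, i.e., a rescaled Brainerd-Robinson coefficient)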
d <- table(points_sub$site_name, points_sub$point_type) %>%
prop.table(margin = 1) %>%
as.matrix() %>%
{
. * 100
}
m <- dist(d, method = "manhattan") %>% as.matrix()
m <- 1 - (m / 200)
diag(m) <- 0
colnames(m)
hist(m)
edges <- m %>%
as.table() %>%
as.data.frame() %>%
setNames(c("from", "to", "weight")) %>%
filter(from != "to") %>%
filter(weight > quantile(weight, 0.75))
library(igraph)
g <- graph_from_data_frame(d = edges, directed = F)
g <- simplify(g)
plot(g)
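# Louvain community detection on the weighted site-similarity graph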
communities <- cluster_louvain(g, weights = E(g)$weight, resolution = 1)
membership_df <- data.frame(
community = membership(communities)
) %>%
rownames_to_column("site_name")
sites_communities <- sites %>%
inner_join(membership_df) %>%
mutate(community = factor(community))
# mapview::mapview(sites_communities, zcol = "community")
coords <- st_coordinates(st_transform(sites_communities, 26912))
sites_communities_df <- sites_communities %>%
st_drop_geometry() %>%
mutate(x = coords[, 1], y = coords[, 2]) %>%
distinct(lat, long, .keep_all = T) %>%
mutate(community = factor(community))
tonto_boundary <- st_as_sfc(st_bbox(
c(
xmin = -111.04,
ymin = 33.62, xmax = -110.92, ymax = 33.68
),
crs = 4326
)) %>%
st_transform(26912) %>%
st_cast("POLYGON") %>%
st_sfc() %>%
st_sf() %>%
mutate(id = "tonto boundary")
tonto_sites <- sites_communities_df %>%
st_as_sf(coords = c("x", "y"), crs = 26912, remove = F) %>%
st_crop(tonto_boundary)
table(tonto_sites$site_name, tonto_sites$community)
# Define shape and color mappings
shape_mapping <- c("1" = 16, "2" = 17, "3" = 18, "4" = 15) # Shape codes
color_mapping <- c(
"1" = "#440154FF", "2" = "#FDE725FF",
"3" = "#31688EFF", "4" = "#35B779FF"
) # Colors
insert <- base_tonto +
geom_sf(data = tonto_sites, aes(
color = community,
shape = community
), alpha = 0.8, size = 3) +
scale_shape_manual(values = shape_mapping) +
scale_color_manual(values = color_mapping) +
coord_sf(
xlim = c(496289.7, 507420.6),
ylim = c(3720024.3, 3726678.8)
) +
guides(color = "none", shape = "none") +
theme(plot.background = element_rect(
fill = "white",
color = "red", linewidth = 2
)) +
ggtitle("Roosevelt")
insert
insert <- ggplotGrob(insert)
xmin_inset <- 600000
xmax_inset <- 900000
ymin_inset <- 3660000
ymax_inset <- 3770000
g <- base +
# ggplot() +
geom_point(data = sites_communities_df, aes(
x = x, y = y,
color = community, shape = community
), alpha = 0.7, size = 3) +
scale_shape_manual(values = shape_mapping) +
scale_color_manual(values = color_mapping) +
geom_sf(data = tonto_boundary, color = "white", fill = "red", alpha = 0.6) +
annotation_custom(
grob = insert,
xmin = 600000, xmax = 900000, ymin = 3660000, ymax = 3770000
) +
theme(legend.position = "top")
nm <- "point_type_boundaries"
# ggsave(file.path("figures",paste0(nm,".png")), dpi = 900, width = 170, units = "mm")
# file.path("figures",paste0(nm,".png")) %>% image_read() %>%
# image_trim() %>% image_border(geometry = "50x50", color = "white") %>%
# image_write(file.path("figures",paste0(nm,".png")))
#### Machine Learning Confusion Matrices ####
# Load libraries
library(tidyverse)
library(janitor)
library(readxl)
library(cowplot)
library(ggtext)
# Define model categories and corresponding metadata columns
models <- list(
bases = "base_type",
notches = "notch_height",
shape = "point_cluster",
serrated = "serrated"
)
notches <- rio::import("data/predicted_types_notches.xlsx") %>%
janitor::clean_names() %>%
as_tibble()
serrated <- rio::import("data/predicted_types_serrated.xlsx") %>%
janitor::clean_names() %>%
as_tibble()
shape <- rio::import("data/predicted_types_shape.xlsx") %>%
janitor::clean_names() %>%
as_tibble()
bases <- rio::import("data/predicted_types_bases.xlsx") %>%
janitor::clean_names() %>%
as_tibble() %>%
filter(base_type != "unknown")
dfs <- list(
bases = bases,
notches = notches,
shape = shape,
serrated = serrated
)
# Helper function to generate ggplot confusion matrix
plot_conf_mat <- function(df, model) {
print(model)
df <- tibble(truth = factor(df[[model]]), prediction = factor(df$predicted_type)) %>%
filter(!is.na(truth), !is.na(prediction))
tab <- table(df$truth, df$prediction)
acc <- round(sum(diag(tab)) / sum(tab), 3)
title <- str_replace_all(model, "_", " ") %>%
str_to_title()
as.data.frame(tab) %>%
rename(Truth = Var1, Predicted = Var2, n = Freq) %>%
ggplot(aes(x = Predicted, y = Truth, fill = n)) +
geom_tile(color = "white") +
geom_text(aes(label = n), color = "black", size = 4) +
scale_fill_gradient(low = "white", high = "#2c7fb8") +
theme_minimal(base_size = 10) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none",
plot.title = element_markdown(size = 11, face = "bold"),
plot.background = element_rect(fill = "white", color = "black", linewidth = 1.5)
) +
labs(
title = glue::glue("{title}<br><span style='font-size:10pt;'>Accuracy = {acc}</span>"),
x = "Predicted",
y = "True"
)
}
# Generate plots
plots <- map2(
dfs,
models,
plot_conf_mat
)
# Combine into single figure
final_plot <- plot_grid(plotlist = plots, ncol = 2, labels = "AUTO")
# Save to file
fn <- "figures/confusion_matrices_combined.png"
ggsave(fn, final_plot, width = 6.5, height = 8, dpi = 600)
image_read(fn) %>%
image_trim() %>%
image_border(geometry = "50x50", color = "white") %>%
image_write(fn)
#### export session info ####
# Get the R version
r_version <- paste0("R version=", R.version$major, ".", R.version$minor)
# Get the loaded packages and their versions
loaded_packages <- sessionInfo()$otherPkgs
loaded_versions <- sapply(
loaded_packages,
function(x) paste(x$Package, x$Version, sep = "=")
)
# Write the R version and package versions to the file
writeLines(c(r_version, loaded_versions), con = "analysis/r_environment2.qmd")
R Environment
R version=4.4.3 cowplot=1.1.3 Hotelling=1.0-8 corpcor=1.6.10 rnaturalearth=1.0.1 viridis=0.6.5 viridisLite=0.4.2 colorspace=2.1-1 ggfx=1.0.1 ggnewscale=0.5.1 ggrepel=0.9.6 terra=1.8-29 SpatialKDE=0.8.2 randomForest=4.7-1.2 umap=0.2.10.0 sf=1.0-19 magick=2.8.5 gt=0.11.1 flextable=0.9.7 NetworkDistance=0.3.4 vegan=2.6-10 lattice=0.22-6 permute=0.9-7 ggnetwork=0.5.13 igraph=2.1.4 rio=1.2.3 here=1.0.1 ggthemes=5.1.0 huxtable=5.5.7 kableExtra=1.4.0 Momocs=1.4.1 magrittr=2.0.3 lubridate=1.9.4 forcats=1.0.0 stringr=1.5.1 dplyr=1.1.4 purrr=1.0.4 readr=2.1.5 tidyr=1.3.1 tibble=3.2.1 ggplot2=3.5.1 tidyverse=2.0.0
Python Training Code
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from torchvision.models import efficientnet_v2_s
import contextlib
import time
from sklearn.model_selection import train_test_split
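# Evaluate a trained model on the held-out test set and report average loss and accuracy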
def test_model(model, test_loader, device, criterion):
model.eval()
correct = 0
total = 0
test_loss = 0.0
with torch.no_grad():
for images, labels in test_loader:
images, labels = images.to(device), labels.to(device)
outputs = model(images)
loss = criterion(outputs, labels)
test_loss += loss.item()
_, predicted = outputs.max(1)
total += labels.size(0)
correct += predicted.eq(labels).sum().item()
avg_test_loss = test_loss / len(test_loader)
test_accuracy = 100 * correct / total
print(
f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
return avg_test_loss, test_accuracy
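# Train an EfficientNetV2-S classifier to predict a single metadata field
# (class_type) from point images, logging metrics and saving a checkpoint
# after each epoch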
def train_and_save_model(class_type, form, include_types, version):
# Load the Excel file
data_path = "data/heurist_projectilePoints.csv"
df = pd.read_csv(data_path)
if isinstance(form, str):
form = [form]
df = df[df['Point Form'].isin(form)].copy()
df = df[df[class_type].isin(include_types)].copy()
df['Media file Name'] = df['objectID'].astype(str) + ".png"
img_dir = "data/pointImages/ml"
image_filenames = set(os.listdir(img_dir))
df_exists = df[df['Media file Name'].isin(image_filenames)].copy()
counts = df_exists[class_type].value_counts().reset_index()
counts[f'{class_type}_new'] = counts.apply(
lambda row: row[class_type] if row['count'] > 9 else 'other', axis=1
)
counts = counts[[class_type, f'{class_type}_new']]
df_merged = pd.merge(df_exists, counts, on=class_type, how='left')
df_merged = df_merged.drop(columns=[class_type])
df_merged = df_merged.rename(columns={f'{class_type}_new': class_type})
df_merged = df_merged[~df_merged[class_type].isin(
['Unknown', 'other', 'Unclassified']
)].copy()
df_merged = df_merged.dropna(subset=[class_type]).copy()
train_df, test_df = train_test_split(
df_merged, test_size=0.15,
stratify=df_merged[class_type], random_state=42
)
# Define the number of rotations for each image to expand the dataset
ROTATION_ANGLES = [0] # You can adjust rotation angles if needed
# Create a custom dataset for images and labels with augmentations
class AugmentedPointClusterDataset(Dataset):
def __init__(self, df, img_dir, transform=None,
rotation_angles=ROTATION_ANGLES):
self.df = df
self.img_dir = img_dir
self.transform = transform
self.rotation_angles = rotation_angles
self.label_to_idx = {
label: idx for idx, label in enumerate(include_types)}
self.idx_to_label = {
idx: label for label, idx in self.label_to_idx.items()}
def __len__(self):
return len(self.df) * len(self.rotation_angles)
def __getitem__(self, idx):
original_idx = idx // len(self.rotation_angles)
rotation_angle = self.rotation_angles[idx % len(
self.rotation_angles
)]
img_name = self.df.iloc[original_idx]["Media file Name"]
img_path = os.path.join(self.img_dir, img_name)
image = Image.open(img_path).convert("RGB")
image = image.rotate(rotation_angle)
if self.transform:
image = self.transform(image)
label = self.df.iloc[original_idx][class_type]
label_idx = self.label_to_idx[label]
return image, label_idx
# Define transformations
transform = transforms.Compose([
transforms.Resize((224, 224)),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[
0.229, 0.224, 0.225]),
])
# Create Dataset and DataLoader with augmentations
train_dataset = AugmentedPointClusterDataset(
train_df, img_dir, transform=transform)
test_dataset = AugmentedPointClusterDataset(
test_df, img_dir, transform=transform)
train_loader = DataLoader(
train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(
test_dataset, batch_size=4, shuffle=False)
# Load the EfficientNetV2 model and adjust the classification head
model = efficientnet_v2_s(pretrained=True)
num_classes = len(include_types)
# print(model.classifier)
model.classifier[1] = nn.Linear(
model.classifier[1].in_features, num_classes) # type: ignore
# Set up loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4) # type: ignore
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)
epochs = 12
form_string = "_".join(form)
log_file_path = f"models/class_{class_type}_{form_string}_{version}.txt"
for epoch in range(epochs):
start_time = time.time()
with contextlib.redirect_stdout(open(os.devnull, 'w')):
model.train()
running_loss = 0.0
correct = 0
total = 0
for images, labels in train_loader:
images, labels = images.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(images)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
_, predicted = outputs.max(1)
total += labels.size(0)
correct += predicted.eq(labels).sum().item()
epoch_loss = running_loss / len(train_loader)
epoch_accuracy = 100 * correct / total
elapsed_time = time.time() - start_time
print(f"""
Epoch {epoch + 1}/{epochs},
Loss: {epoch_loss:.4f},
Accuracy: {epoch_accuracy:.2f}%,
Time: {elapsed_time:.2f}
seconds
""")
# Save the model for the current epoch
model_save_path = f"""
models/class_{class_type}_{form_string}_epoch_{epoch + 1}_{version}.pth
"""
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")
# Log training metrics to file
with open(
log_file_path, 'a'
) as log_file:
log_file.write(
f"Epoch {epoch + 1}/{epochs}, "
f"Loss: {epoch_loss:.4f}, "
f"Accuracy: {epoch_accuracy:.2f}%\n"
)
# Evaluate on the test set and log test metrics
test_loss, test_accuracy = test_model(
model, test_loader, device, criterion)
with open(log_file_path, 'a') as log_file:
log_file.write(
f"Test Loss: {test_loss:.4f}, "
f"Test Accuracy: {test_accuracy:.2f}%\n"
)
# Example usage
data_path = "data/heurist_projectilePoints.csv"
df = pd.read_csv(data_path)
df.columns
# df['Point Form'].unique()
# df['Notch Height'].unique()
# class_type = 'Notch Height'
# form = ['side-notched']
# include_types = ["low","mid","high"]
# train_and_save_model(class_type=class_type, form=form,
# include_types=include_types, version="v1")
# models\class_Notch Height_side-notched_epoch_12_v1.pth
df['Base Type'].unique()
class_type = 'Base Type'
form = ['side-notched', 'triangular']
include_types = ['straight', 'concave', 'deeply concave', 'convex']
train_and_save_model(class_type=class_type, form=form,
include_types=include_types, version="v2")
# models\class_Base Type_side-notched_triangular_epoch_12_v2.pth
# class_type = 'Point Cluster'
# df[class_type].unique()
# form = ['side-notched','triangular']
# include_types = ['shape-2', 'shape-3', 'shape-1']
# train_and_save_model(class_type=class_type, form=form,
# include_types=include_types, version="v1")
# class_Point Cluster_side-notched_triangular_epoch_9_v1.pth
# class_type = 'Serrated'
# df[class_type].unique()
# form = ['side-notched','triangular']
# include_types = ['No', 'Yes']
# train_and_save_model(class_type=class_type, form=form,
# include_types=include_types, version="v1")
# models\class_Serrated_side-notched_triangular_epoch_12_v1.pth
Python Evaluation Code
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from torchvision.models import efficientnet_v2_s
import torch.nn.functional as F
from tqdm import tqdm
def predict(image_path, model, transform, labels, top_k=5, verbose=False):
# Automatically adjust top_k based on the number of classes
num_classes = len(labels)
top_k = min(top_k, num_classes)
model.eval() # Set the model to evaluation mode
image = Image.open(image_path).convert("RGB")
image = transform(image).unsqueeze(0).to(device)
with torch.no_grad(): # Disable gradient computation for inference
outputs = model(image) # Get the raw logits from the model
# Apply softmax to get the probabilities
probabilities = F.softmax(outputs, dim=1)
# Get the top k probabilities and their corresponding indices
top_probs, top_idxs = probabilities.topk(top_k, dim=1)
# Convert the top probabilities and indices to lists
top_probs = top_probs.cpu().numpy().flatten()
top_idxs = top_idxs.cpu().numpy().flatten()
# Print the top k predictions with probabilities
if verbose:
print("Top predictions:")
for i in range(top_k):
label = labels[top_idxs[i]] # Get the label based on the index
prob = top_probs[i]
print(f"{i+1}: {label} ({prob:.4f})")
predicted_idx = top_idxs[0]
# Get the label from the labels list
predicted_label = labels[predicted_idx]
predicted_prob = top_probs[0]
return predicted_label, predicted_prob
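# The code below loads a saved checkpoint, rebuilds the classification head
# to match the label set, runs predict() over every catalogued image, and
# writes the predicted class and probability for each point to a TSV file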
# Define class_type, form, and include_types for evaluation
class_type = 'Point Cluster'
form = ['side-notched', 'triangular']
include_types = ['shape-2', 'shape-3', 'shape-1']
# Define transformations
transform = transforms.Compose([
transforms.Resize((224, 224)), # Resize to 224x224 for EfficientNet
transforms.RandomHorizontalFlip(), # Randomly flip the image horizontally
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
])
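# Note: RandomHorizontalFlip is carried over from the training transform, so
# individual predictions can vary slightly between runs; dropping it would
# make inference deterministic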
labels = include_types
# Load the EfficientNetV2 model and adjust the classification head
model = efficientnet_v2_s(pretrained=True)
model.classifier[1] = nn.Linear(
model.classifier[1].in_features, len(labels)
) # type: ignore
# Load the saved model weights; the checkpoint must match class_type and
# the number of labels (here the Point Cluster model noted above)
model_save_path = (
"models/class_Point Cluster_side-notched_triangular_epoch_9_v1.pth"
)
model.load_state_dict(torch.load(model_save_path))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)
# Define image directory and data path
img_dir = "data/pointImages/ml"
# Get a set of all image filenames in img_dir
image_filenames = set(os.listdir(img_dir))
if isinstance(form, str):
form = [form]
data_path = "data/heurist_projectilePoints.csv"
df = pd.read_csv(data_path)
df['Media file Name'] = df['objectID'].astype(str) + ".png"
df_exists = df[df['Media file Name'].isin(image_filenames)].copy()
len(df_exists)
df_exists[class_type].unique()
df_unknown = df_exists.copy()
# df_unknown = df_exists[pd.isna(df_exists[class_type])]
# df_unknown = df_unknown[df_unknown['Point Form'].isin(form)].copy()
# df_unknown = df_unknown[['Projectile point H-ID',
# 'objectID', class_type, 'Media file Name', 'Point Form']]
# df_unknown['Predicted Type'] = None
# df_unknown['Prediction Probability'] = None
len(df_unknown)
# Predict and update df_unknown DataFrame
for image_name in tqdm(df_unknown['Media file Name'],
desc="Processing images"):
if image_name.lower().endswith(".png"):
image_path = os.path.join(img_dir, image_name)
# Use the predict function to get
# the predicted class and probability
predicted_cluster, predicted_prob = predict(
image_path, model, transform, labels, top_k=5, verbose=False
)
# Set predicted_cluster to
# "Unclassified" if predicted_prob is below 0.5
# predicted_cluster =
# "Unclassified" if predicted_prob < 0.5 else predicted_cluster
# Update the df_unknown DataFrame with the predicted type and probability
df_unknown.loc[df_unknown['Media file Name'] == image_name,
'Predicted Type'] = predicted_cluster
df_unknown.loc[df_unknown['Media file Name'] == image_name,
'Prediction Probability'] = predicted_prob
# Save the updated df_unknown DataFrame to a tab-separated (TSV) file
output_tsv_path = "data/predicted_types.tsv"
df_unknown.to_csv(output_tsv_path, sep='\t', index=False)
print(f"Results saved to {output_tsv_path}")Python Environment
Note: this Python environment was used for all Python code in this dissertation.
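The python==3.8.19 entry records the interpreter version rather than a pip-installable package, and a few entries are left unpinned. One way to recreate the environment (a sketch, assuming a Python 3.8 interpreter and the package lines below saved to a requirements.txt file without the python== line) is to create a virtual environment with `python -m venv .venv`, activate it, and run `python -m pip install -r requirements.txt`.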
python==3.8.19
aiofiles==23.2.1
aiohappyeyeballs==2.4.3
aiohttp==3.10.11
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
asgiref==3.8.1
asttokens==3.0.0
async-timeout==5.0.1
asyncer==0.0.8
attrs==24.2.0
backcall==0.2.0
backports.zoneinfo==0.2.1
Bottleneck
Brotli
build==1.2.2
cachetools==5.5.0
certifi
cffi==1.17.1
charset-normalizer
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
comm==0.2.2
contourpy==1.1.1
cryptography==43.0.1
cycler==0.12.1
debugpy==1.8.13
decorator==5.2.1
distro==1.9.0
Django==4.2.16
efficientnet_pytorch==0.7.1
et-xmlfile==1.1.0
exceptiongroup==1.2.2
executing==2.2.0
fancycompleter==0.9.1
fastapi==0.115.5
ffmpy==0.4.0
filelock
filetype==1.2.0
flatbuffers==24.3.25
fonttools==4.53.1
frozenlist==1.5.0
fsspec==2024.9.0
gmpy2
google-ai-generativelanguage==0.1.0
google-api-core==2.19.2
google-auth==2.34.0
google-generativeai==0.1.0rc1
googleapis-common-protos==1.65.0
gradio==4.44.1
gradio_client==1.3.0
grpcio==1.66.1
grpcio-status==1.62.3
h11==0.14.0
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.25.0
humanfriendly==10.0
idna
imageio==2.35.1
importlib_metadata==8.5.0
importlib_resources==6.4.4
ipykernel==6.29.5
ipython==8.12.3
jedi==0.19.2
Jinja2
jiter==0.5.0
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter_client==8.6.3
jupyter_core==5.7.2
kiwisolver==1.4.5
lazy_loader==0.4
llvmlite==0.41.1
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe
matplotlib==3.7.5
matplotlib-inline==0.1.7
mdurl==0.1.2
mkl-fft
mkl-random
mkl-service==2.4.0
mpmath
multidict==6.1.0
munch==4.0.0
nest-asyncio==1.6.0
networkx
numba==0.58.1
numexpr
numpy
onnxruntime==1.19.2
onnxruntime-gpu==1.19.2
openai==1.45.1
opencv-python-headless==4.10.0.84
openpyxl==3.1.5
orjson==3.10.11
packaging==24.2
pandas
parso==0.8.4
pdbpp==0.10.3
pdf2image==1.17.0
pdfminer.six==20231228
pdfplumber==0.11.4
pickleshare==0.7.5
pillow
pip-tools==7.4.1
pkgutil_resolve_name==1.3.10
platformdirs==4.3.6
pooch==1.8.2
pretrainedmodels==0.7.4
prompt_toolkit==3.0.50
propcache==0.2.0
proto-plus==1.24.0
protobuf==4.25.4
psutil==7.0.0
pure_eval==0.2.3
pyarrow==17.0.0
pyasn1==0.6.1
pyasn1_modules==0.4.1
pycparser==2.22
pydantic==2.9.1
pydantic_core==2.23.3
pydub==0.25.1
Pygments==2.19.1
PyMatting==1.1.13
pynndescent==0.5.13
pypandoc==1.13
pyparsing==3.1.2
PyPDF2==3.0.1
pypdfium2==4.30.0
pyproject_hooks==1.2.0
pyreadline==2.1
pyreadline3==3.5.4
pyrepl==0.9.0
PySocks
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.17
pyTRS
pytz
PyWavelets==1.4.1
pywin32==310
PyYAML
pyzmq==26.3.0
referencing==0.35.1
regex==2024.9.11
rembg==2.0.59
requests
rpds-py==0.20.1
rsa==4.9
ruff==0.7.4
safetensors==0.4.5
scikit-image==0.21.0
scikit-learn==1.3.2
scipy==1.10.1
segmentation-models-pytorch==0.3.3
semantic-version==2.10.0
shapely==2.0.6
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
sqlparse==0.5.1
stack-data==0.6.3
starlette==0.41.2
sympy
threadpoolctl==3.5.0
tifffile==2023.7.10
tiktoken==0.7.0
timm==0.9.2
tomli==2.0.1
tomlkit==0.12.0
torch==2.4.0
torchaudio==2.4.0
torchvision==0.19.0
tornado==6.4.2
tqdm==4.66.5
traitlets==5.14.3
typer==0.13.0
typing_extensions==4.12.2
tzdata
umap-learn==0.5.6
urllib3
uunet==2.2.1
uvicorn==0.32.0
watchdog==4.0.2
wcwidth==0.2.13
websockets==12.0
win-inet-pton
wmctrl==0.5
yarl==1.15.2
zipp==3.20.2