Appendix B — Code and Environments for Chapter 3

This appendix contains the code used for the analysis in Chapter 3. The code is organized into five sections: R Code, R Environment, Python Training Code, Python Evaluation Code, and Python Environment. The R Code section contains the analysis code, the R Environment section lists the R packages and versions used to run the analysis, the Python Training Code section contains the code used to train the model, the Python Evaluation Code section contains the code used to evaluate the model, and the Python Environment section lists the Python packages used to run the analysis.

R Code

library(tidyverse)
library(here)
library(magick)
library(sf)
library(Momocs)
library(umap)
library(randomForest)
library(SpatialKDE)
library(terra)
library(ggrepel)
library(ggnewscale)
library(ggfx)
library(colorspace)
library(viridis)
library(rnaturalearth)

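# Run pairwise Hotelling's T-squared tests between every pair of labels in
# `groups`, treating `category` as the grouping column of `data` and all
# remaining columns as the multivariate response. Returns a tibble with one
# row per comparison: group names, sample sizes, T-squared, and p-value.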
run_hotellings <- function(groups, category, data) {
  # Load necessary library
  library(Hotelling)

  vars <- names(data)
  vars <- vars[!vars %in% c(category)]

  # Initialize the results dataframe
  results <- data.frame(
    group1 = character(),
    group2 = character(),
    n_group1 = integer(),
    n_group2 = integer(),
    T_squared = numeric(),
    p_value = numeric(),
    stringsAsFactors = FALSE
  )

  # Loop over each pair of groups
  for (i in 1:(length(groups) - 1)) {
    for (j in (i + 1):length(groups)) {
      group1 <- groups[i]
      group2 <- groups[j]
      if (group1 == group2) next

      # Subset the data for the two groups
      subset_data <- data[data[[category]] %in% c(group1, group2), ]

      # Split the data into the two groups for Hotelling's T² test
      group1_data <- subset_data[subset_data[[category]] == group1, vars]
      group2_data <- subset_data[subset_data[[category]] == group2, vars]

      # Calculate the sample size for each group
      n_group1 <- nrow(group1_data)
      n_group2 <- nrow(group2_data)

      # Debugging: print the group names and sizes to verify
      cat(
        "Comparing", group1, "vs", group2, "- n_group1:",
        n_group1, "n_group2:", n_group2, "\n"
      )

      # Check if there are enough samples for Hotelling's T²
      if (n_group1 < 5 || n_group2 < 5) {
        warning(paste(
          "Skipping comparison for", group1, "vs", group2,
          "due to insufficient data"
        ))
        next
      }

      # Perform Hotelling's T² test
      result <- tryCatch(
        {
          hotelling_result <- hotelling.test(group1_data, group2_data)
          hotelling_result
        },
        error = function(e) {
          warning(paste(
            "Error in Hotelling's T² test for", group1, "vs",
            group2, ":", e$message
          ))
          return(NULL)
        }
      )

      if (!is.null(result)) {
        # Append results to the dataframe
        results <- rbind(
          results,
          data.frame(
            group1 = group1,
            group2 = group2,
            n_group1 = n_group1,
            n_group2 = n_group2,
            T_squared = result$stats$statistic,
            p_value = result$pval
          )
        )
      } else {
        # Debugging: indicate that the result is NULL for this comparison
        cat("No result for", group1, "vs", group2, "\n")
      }
    }
  }

  # Ensure p_value is numeric and round to 3 decimal places
  results <- results %>%
    mutate(p_value = as.numeric(p_value) %>%
      round(3)) %>%
    tibble::as_tibble()

  # Check if results dataframe is empty and return a message if so
  if (nrow(results) == 0) {
    warning("No valid comparisons were made. Check your input data.")
  }

  return(results)
}

i_am("analysis/Chap3Analysis.R")

points <- rio::import("data/heurist_projectilePoints.csv") %>%
  janitor::clean_names() %>%
  as_tibble()
points %>% glimpse()

#### Type Collection ####

type_collection <- points %>%
  filter(map_lgl(
    project_name_title,
    ~ any(.x %in% c("Justice", "MNA Type Collection"))
  )) %>%
  unchop(project_name_title) %>%
  mutate(original_point_type = as.character(original_point_type)) %>%
  filter(!is.na(x1))

types <- table(type_collection$original_point_type) %>%
  as.data.frame() %>%
  filter(Freq > 5) %>%
  pull(Var1) %>%
  as.character()

landmarks <- points %>%
  select(x1:y8) %>%
  names()

plotdf <- type_collection %>%
  rowid_to_column() %>%
  filter(original_point_type %in% types) %>%
  select(rowid, all_of(landmarks)) %>%
  pivot_longer(any_of(starts_with("x")), names_to = "xl", values_to = "x") %>%
  select(-any_of(landmarks)) %>%
  bind_cols(type_collection %>%
    rowid_to_column() %>%
    filter(original_point_type %in% types) %>%
    select(rowid, all_of(landmarks)) %>%
    pivot_longer(any_of(starts_with("y")),
      names_to = "yl", values_to = "y"
    ) %>%
    select(-any_of(landmarks), -rowid)) %>%
  left_join(type_collection %>% select(-any_of(landmarks)) %>%
    rowid_to_column() %>% select(rowid, original_point_type, point_form))

plotside <- plotdf %>%
  filter(point_form == "side-notched") %>%
  filter(original_point_type != "Cottonwood Triangular")
mean_df <- plotside %>%
  filter(original_point_type != "Cottonwood Triangular") %>%
  select(-rowid) %>%
  group_by(original_point_type, xl, yl) %>%
  summarise_at(vars(x, y), mean, na.rm = T) %>%
  mutate(original_point_type = str_remove_all(
    original_point_type,
    "Side-notched|Side Notched"
  ) %>% str_replace_all("  ", " ") %>%
    as.character())

plotside %>%
  mutate(original_point_type = str_remove_all(
    original_point_type,
    "Side-notched|Side Notched"
  ) %>% str_replace_all("  ", " ") %>%
    as.character()) %>%
  ggplot(aes(x, y, fill = original_point_type)) +
  geom_point(size = 1, shape = 21, color = "white", alpha = .5) +
  geom_point(
    data = mean_df, aes(
      x = x, y = y,
      fill = original_point_type
    ),
    shape = 21, size = 4, alpha = .8, color = "black"
  ) +
  scale_fill_viridis_d() +
  theme_bw() +
  coord_fixed() +
  guides(fill = "none") +
  facet_wrap(~original_point_type) +
  theme(
    strip.background = element_rect(fill = "white"),
    strip.text = element_text(size = 6)
  )

nm <- "mean_landmarks_side"

# ggsave(file.path("figures",paste0(nm,".png")),
#  dpi = 900, width = 170, units = "mm")

# file.path("figures",paste0(nm,".png")) %>% image_read() %>%
#  image_trim() %>% image_border(geometry = "50x50", color = "white") %>%
#    image_write(file.path("figures",paste0(nm,".png")))

plotmeanall <- plotside %>%
  group_by(xl, yl) %>%
  summarize_all(mean) %>%
  janitor::remove_constant() %>%
  mutate(label = str_remove_all(xl, "x"))

# magick::image_read(fn) %>% magick::image_trim() %>%
#  magick::image_write(fn, density = 900)

# Landmark example #

point <- read_csv("data/meanpointexample.csv", show_col_types = F)

ggplot() +
  geom_polygon(data = point, aes(x, y), color = "black", linewidth = 1) +
  geom_point(
    data = plotmeanall, aes(x, y), shape = 16, color = "red",
    alpha = .75, size = 8
  ) +
  geom_text(
    data = plotmeanall, aes(x, y, label = label), color = "white",
    size = 4
  ) +
  theme_void() +
  coord_equal()

# fn = "figures/landmarks_example.png"
# ggsave(fn,dpi = 900, width = 4, height = 5)

# triangular

plottri <- plotdf %>%
  filter(point_form == "triangular") %>%
  mutate(original_point_type = str_remove_all(
    original_point_type, "Triangular"
  ) %>%
    str_replace_all("  ", " ") %>% as.character())
mean_df <- plottri %>%
  select(-rowid) %>%
  group_by(original_point_type, xl, yl) %>%
  summarise_at(vars(x, y), mean, na.rm = T)

plottri %>%
  ggplot(aes(x, y, fill = original_point_type)) +
  geom_point(size = 1, shape = 21, color = "white", alpha = .5) +
  geom_point(
    data = mean_df, aes(x = x, y = y, fill = original_point_type),
    shape = 23, size = 4, alpha = .8, color = "black"
  ) +
  scale_fill_viridis_d() +
  theme_bw() +
  coord_equal() +
  guides(fill = "none") +
  facet_wrap(~original_point_type) +
  theme(
    strip.background = element_rect(fill = "white"),
    strip.text = element_text(size = 7.5)
  )

nm <- "mean_landmarks_tri"

# ggsave(file.path("figures",paste0(nm,".png")),
#  dpi = 900, width = 170, units = "mm")

# file.path("figures",paste0(nm,".png")) %>%
# image_read() %>% image_trim() %>%
#  image_border(geometry = "50x50", color = "white") %>%
#    image_write(file.path("figures",paste0(nm,".png")))

# Examine side-notched types

type_side <- type_collection %>%
  filter(point_form == "side-notched") %>%
  filter(original_point_type != "Cottonwood Triangular")
types <- table(type_side$original_point_type) %>%
  as.data.frame() %>%
  filter(Freq > 5) %>%
  pull(Var1) %>%
  as.character()
type_side <- type_side %>%
  filter(original_point_type %in% types)
u <- umap(type_side %>% select(any_of(landmarks)))
u_df <- u$layout %>%
  as.data.frame() %>%
  cbind(type_side %>%
    select(original_point_type)) %>%
  as_tibble() %>%
  mutate(original_point_type = original_point_type %>% factor()) %>%
  rename(`point type` = original_point_type, x = V1, y = V2)

u_df_sub <- u_df %>%
  filter(`point type` %in% c("Awatovi", "Bonito", "Buck Taylor"))

# Create a custom legend for both color and shape by combining the aesthetics
ggplot(u_df, aes(y, x, color = `point type`, shape = `point type`)) +
  geom_point(size = 3) +
  scale_color_viridis_d() +
  scale_shape_manual(values = c(15, 17, 18, 19, 19, 19, 19, 19, 19, 19)) +
  stat_ellipse(
    data = u_df_sub, aes(fill = `point type`),
    geom = "polygon", level = 0.68, alpha = .1
  ) +
  theme_bw() +
  coord_equal() +
  guides(
    fill = guide_none(),
    color = guide_legend(ncol = 3)
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 8),
    legend.key.size = unit(0.4, "cm"),
    legend.margin = ggplot2::margin(t = -20, 0, 0, 0)
  ) +
  xlab("") +
  ylab("")

nm <- "umap_side_original"

# ggsave(file.path("figures",paste0(nm,".png")),
#  dpi = 900, width = 170, units = "mm")
#
# file.path("figures",paste0(nm,".png")) %>%
# image_read() %>% image_trim() %>%
#  image_border(geometry = "50x50", color = "white") %>%
#   image_write(file.path("figures",paste0(nm,".png")))
#
# browseURL(file.path("figures",paste0(nm,".png")))

groups <- levels(u_df$`point type`)
category <- "point type"
results_hot_side_notch_original_types <- run_hotellings(groups, category, u_df)
print(results_hot_side_notch_original_types %>% select(group1, group2, p_value))

tbl1 <- results_hot_side_notch_original_types %>%
  filter(p_value > 0.1) %>%
  mutate(form = "side-notched")
names(tbl1) <- c(
  "Type 1", "Type 2", "Type 1 Total",
  "Type 2 Total", "T-squared", "P-value", "Form"
)

# Examine triangular types
type_tri <- type_collection %>%
  filter(point_form == "triangular")
types <- table(type_tri$original_point_type) %>%
  as.data.frame() %>%
  filter(Freq > 5) %>%
  pull(Var1) %>%
  as.character()
type_tri <- type_tri %>%
  filter(original_point_type %in% types)
landmarks <- type_tri %>%
  select(x1:y5) %>%
  names()
u <- umap(type_tri %>% select(any_of(landmarks)))
u_df <- u$layout %>%
  as.data.frame() %>%
  cbind(type_tri %>%
    select(original_point_type)) %>%
  as_tibble() %>%
  mutate(original_point_type = original_point_type %>% factor()) %>%
  rename(`point type` = original_point_type, x = V1, y = V2)

ggplot(u_df, aes(x, y, color = `point type`, shape = `point type`)) +
  geom_point(size = 3) +
  scale_shape_manual(values = c(19, 17, 19, 15, 15, 18)) +
  stat_ellipse(aes(fill = `point type`),
    geom = "polygon",
    level = 0.68, alpha = .1
  ) +
  theme_bw() +
  scale_color_viridis_d() +
  coord_equal() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.margin = ggplot2::margin(-20, 0, 0, 0, unit = "pt")
  ) +
  guides(color = guide_legend(ncol = 3)) +
  xlab("") +
  ylab("")

nm <- "umap_tri_original"

# ggsave(file.path("figures",paste0(nm,".png")),
#  dpi = 900, width = 170, units = "mm")
#
# file.path("figures",paste0(nm,".png")) %>%
#  image_read() %>% image_trim() %>%
#   image_border(geometry = "50x50", color = "white") %>%
#    image_write(file.path("figures",paste0(nm,".png")))
#
# browseURL(file.path("figures",paste0(nm,".png")))

groups <- levels(u_df$`point type`)
category <- "point type"
results_triangular_original_types <- run_hotellings(groups, category, u_df)
print(results_triangular_original_types %>% select(group1, group2, p_value))

tbl2 <- results_triangular_original_types %>%
  filter(p_value > 0.1) %>%
  mutate(form = "triangular")
names(tbl2) <- c(
  "Type 1", "Type 2",
  "Type 1 Total",
  "Type 2 Total",
  "T-squared",
  "P-value",
  "Form"
)
# write_csv(bind_rows(tbl1,tbl2),
# "tables/hotelling_test_results_original_types.csv")

#### Cluster Analysis of All Landmarks ####

# landmark shape clusters
points_sub <- points %>%
  filter(point_form %in% c("side-notched", "triangular")) %>%
  select(object_id, point_form, x1:y5) %>%
  filter(!is.na(y5))

points_out <- lapply(1:nrow(points_sub), function(i) {
  # Extract the relevant x, y columns for row i
  row <- points_sub[i, c(
    "x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4",
    "x5", "y5"
  )]

  # Convert to numeric and reshape into a 5x2 matrix of (x, y) pairs
  matrix(as.numeric(unlist(row)), ncol = 2, byrow = TRUE)
})

# project triangular and side-notched points into the same morphospace
points_out <- Out(points_out) %>%
  Ldk() %>%
  fgProcrustes()

points_out_df <- as_df(points_out)

u <- umap(points_out_df)
u_df <- u$layout %>%
  as.data.frame() %>%
  cbind(points_sub %>% select(object_id, point_form)) %>%
  as_tibble()
ggplot(u_df, aes(V1, V2, color = point_form)) +
  geom_point()

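# Cluster the UMAP embedding. Hierarchical clustering (Ward's method on a
# Minkowski distance matrix) is plotted for inspection, but the final
# cluster assignments come from k-means (k = 3), which overwrites the
# cutree() result below.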
hc <- hclust(dist(u$layout, method = "minkowski"), method = "ward.D2")

plot(hc)
clusters <- cutree(hc, k = 3)
clusters <- kmeans(u$layout, centers = 3, nstart = 3)$cluster
u_df$cluster <- factor(clusters)
ggplot(u_df, aes(V1, V2, color = cluster)) +
  geom_point() +
  scale_color_viridis_d() +
  coord_equal() +
  theme_bw() +
  xlab("") +
  ylab("") +
  theme(
    legend.position = "bottom",
    legend.margin = ggplot2::margin(-20, 0, 0, 0, unit = "pt")
  )

nm <- "umap_landmark_cluster"

# ggsave(file.path("figures",paste0(nm,".png")),
#  dpi = 900, width = 150, units = "mm")
#
# file.path("figures",paste0(nm,".png")) %>%
#  image_read() %>%
#  image_trim() %>%
#  image_border(geometry = "50x50", color = "white") %>%
#    image_write(file.path("figures",paste0(nm,".png")))

#### EFA Cluster analysis ####

k <- 3

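# A modified version of stats::rect.hclust() that draws filled,
# semi-transparent rectangles (controlled by `alpha`) around the k clusters
# of a dendrogram instead of plain borders.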
rect_hclust_filled <- function(tree, k = NULL, which = NULL, x = NULL, h = NULL,
                               border = 2, cluster = NULL, alpha = 1) {
  if (length(h) > 1L || length(k) > 1L) {
    stop("'k' and 'h' must be a scalar")
  }
  if (!is.null(h)) {
    if (!is.null(k)) {
      stop("specify exactly one of 'k' and 'h'")
    }
    k <- min(which(rev(tree$height) < h))
    k <- max(k, 2)
  } else if (is.null(k)) {
    stop("specify exactly one of 'k' and 'h'")
  }
  if (k < 2 || k > length(tree$height)) {
    stop(gettextf("k must be between 2 and %d", length(tree$height)), domain = NA)
  }

  if (is.null(cluster)) {
    cluster <- cutree(tree, k = k)
  }

  clustab <- table(cluster)[unique(cluster[tree$order])]
  m <- c(0, cumsum(clustab))

  if (!is.null(x)) {
    if (!is.null(which)) {
      stop("specify exactly one of 'which' and 'x'")
    }
    which <- x
    for (n in seq_along(x)) which[n] <- max(which(m < x[n]))
  } else if (is.null(which)) {
    which <- 1L:k
  }

  if (any(which > k)) {
    stop(gettextf("all elements of 'which' must be between 1 and %d", k), domain = NA)
  }

  border <- rep_len(border, length(which))
  retval <- list()

  for (n in seq_along(which)) {
    fill_col <- adjustcolor(border[n], alpha.f = alpha)
    rect(
      m[which[n]] + 0.66,
      par("usr")[3L],
      m[which[n] + 1] + 0.33,
      mean(rev(tree$height)[(k - 1):k]),
      border = border[n],
      col = fill_col
    )
    retval[[n]] <- which(cluster == as.integer(names(clustab)[which[n]]))
  }

  invisible(retval)
}


out <- readRDS(here("data/point_outlines.Rds"))

efa_out <- out %>%
  fgProcrustes(tol = 0.1) %>%
  Momocs::efourier()
pca_out <- efa_out %>% Momocs::PCA()

u <- umap(efa_out$coe)
u_df <- u$layout %>%
  as.data.frame() %>%
  rownames_to_column("object_id") %>%
  as_tibble()

hc <- hclust(dist(u$layout, method = "minkowski"), method = "ward.D2")

# fn = file.path("figures","hclust.png")
# png(fn, width = 200, height = 100, res = 900, units = "mm")
# plot(
#   hc,
#   labels = FALSE,
#   hang = -1,
#   xlab = "",
#   ylab = "",
#   main = "",
#   sub = "",
#   yaxt = "n",
#   cex.main = 1.5,
#   cex.axis = 1.2
# )
# rect_hclust_filled(hc, k = k, border = c("#1b9e77", "#d95f02", "#7570b3"), alpha = .2)
# dev.off()
# image_read(fn) %>%
#   image_trim() %>%
#   image_border(geometry = "50x50", color = "white") %>%
#   image_write(fn)
# browseURL(fn)

clusters <- cutree(hc, k = k)
u_df$cluster <- factor(clusters)
out$fac$cluster <- factor(clusters)
# mean shapes of clusters
out_split <- chop(out, fac = "cluster")

# Compute mean shapes for each category
mean_shapes_list <- lapply(out_split, function(coo_object) {
  MSHAPES(coo_object$coo)
})
mean_shapes_list <- Momocs::combine(mean_shapes_list %>% Out())
mean_shapes_list$fac <- tibble(cluster = names(mean_shapes_list))

a <- coo_length(mean_shapes_list) / min(coo_length(mean_shapes_list))
mean_shapes_list <- coo_scale(mean_shapes_list, a)
stack(mean_shapes_list)

mean_shapes_df <- as_df(mean_shapes_list) %>%
  mutate(coo = map(coo, as_tibble)) %>%
  unnest(coo) %>%
  group_split(cluster)

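# Trimmed centroid of a point cloud: drop x and y values outside the central
# quantile range set by `q`, then return the means of what remains. Used to
# position the mean-shape insets near the middle of each cluster.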
getbbox <- function(x, y, q = .75) {
  q1 <- quantile(x, 1 - q)
  q2 <- quantile(x, q)
  q3 <- quantile(y, 1 - q)
  q4 <- quantile(y, q)
  xt <- x[x > q1 & x < q2]
  yt <- y[y > q3 & y < q4]
  return(c(mean(xt), mean(yt)))
}

u_df_group <- u_df %>%
  group_split(cluster)

coords <- lapply(1:length(u_df_group), function(i) {
  tmp <- u_df_group[[i]]
  getbbox(tmp$V1, tmp$V2)
})

p <- list()

for (i in 1:length(mean_shapes_df)) {
  tmp <- mean_shapes_df[[i]] %>%
    ggplot(aes(x = V1, y = V2, group = cluster)) +
    geom_polygon(fill = "#404040") +
    theme_void() +
    coord_equal()
  tmp <- ggplotGrob(tmp)
  p[[i]] <- tmp
}

library(grid)
library(cowplot)

u_df %>%
  rename(x = V1, y = V2) %>%
  ggplot(aes(x = y, y = x, color = cluster)) +
  annotation_custom(
    grob = p[[1]],
    ymin = coords[[1]][1] - .75,
    ymax = coords[[1]][1] + .75,
    xmin = coords[[1]][2] - 1.5,
    xmax = coords[[1]][2] + 1.5
  ) +
  annotation_custom(
    grob = p[[2]],
    ymin = coords[[2]][1] - .75,
    ymax = coords[[2]][1] + .75,
    xmin = coords[[2]][2] - 1.5,
    xmax = coords[[2]][2] + 1.5
  ) +
  annotation_custom(
    grob = p[[3]],
    ymin = coords[[3]][1] - .75,
    ymax = coords[[3]][1] + .75,
    xmin = coords[[3]][2] - 1.5,
    xmax = coords[[3]][2] + 1.5
  ) +
  geom_point(size = .75, alpha = 0.66) +
  stat_ellipse(aes(fill = cluster), geom = "polygon", level = 0.68, alpha = .1) +
  theme_bw() +
  scale_color_viridis_d() +
  coord_equal() +
  # remove x and y labels
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks = element_blank(),
    legend.margin = ggplot2::margin(-10, 0, 0, 0, unit = "pt"),
  ) +
  theme(legend.position = "bottom")

nm <- "cluster_shapes_all"

# ggsave(file.path("figures",paste0(nm,".png")),
# dpi = 900, width = 170, height = 100, units = "mm")
# file.path("figures",paste0(nm,".png")) %>%
# image_read() %>% image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
# image_write(file.path("figures",paste0(nm,".png")))
# browseURL(file.path("figures",paste0(nm,".png")))


# check cluster assignments
tst <- manova(cbind(u_df$V1, u_df$V2) ~ cluster, data = u_df)
summary(tst, test = "Wilks")

run_hotellings(levels(u_df$cluster), "cluster", u_df[, c("cluster", "V1", "V2")])

points_update <- points %>%
  select(`H-ID` = projectile_point_h_id, object_id) %>%
  inner_join(u_df %>% select(object_id, cluster)) %>%
  rowwise() %>%
  mutate(cluster = paste0("shape-", cluster))

#### examine base shapes using landmarks ####


# calculate the angle between the base points y3 and y5
points <- points %>%
  rowwise() %>%
  mutate(base_slope = atan((y5 - y3) / (x5 - x3)) * 180 / pi)

points %>%
  pull(base_slope) %>%
  summary()
points %>%
  pull(base_slope) %>%
  hist()

points %>%
  filter(!is.na(base_slope)) %>%
  filter(!base_type %in% c("", "unknown")) %>%
  mutate(base_type = factor(base_type,
    levels = c("deeply concave", "concave", "straight", "convex")
  )) %>%
  ggplot(aes(x = base_slope, fill = base_type)) +
  geom_boxplot()

points %>%
  filter(!is.na(base_slope)) %>%
  filter(!base_type %in% c("", "unknown")) %>%
  group_by(base_type) %>%
  summarize(
    mean_slope = mean(base_slope, na.rm = T),
    quantile1 = quantile(base_slope, .25, na.rm = T),
    quantile3 = quantile(base_slope, .75, na.rm = T)
  )

points <- points %>%
  filter(basal_notch == "") %>%
  mutate(base_type_updated = case_when(
    base_slope < -18 ~ "deeply concave",
    base_slope < -5 ~ "concave",
    base_slope > 5 ~ "convex",
    TRUE ~ "straight"
  ))

points %>%
  filter(!is.na(base_slope)) %>%
  filter(!base_type %in% c("", "unknown")) %>%
  mutate(base_type_updated = factor(base_type_updated,
    levels = c("deeply concave", "concave", "straight", "convex")
  )) %>%
  ggplot(aes(x = base_slope, fill = base_type_updated)) +
  geom_boxplot()

diff <- points %>%
  filter(!is.na(base_slope)) %>%
  filter(!base_type %in% c("", "unknown")) %>%
  filter(base_type != base_type_updated) %>%
  select(object_id, base_type, base_type_updated, base_slope, basal_notch)

diff <- diff %>%
  ungroup() %>%
  slice(175:728) %>%
  filter(base_type_updated == "deeply concave")

# write_tsv(diff, "C://users/rjbischo/Downloads/base_type_diff.tsv", na = "")

#### Notch Heights ####

points_side <- points %>%
  filter(point_form == "side-notched") %>%
  select(object_id, point_type, x1:y8) %>%
  filter(!is.na(x6)) %>%
  rowwise() %>%
  mutate(
    notch_height_bottom =
      dist(matrix(c(x3, y3, x8, y8), ncol = 2, byrow = T)) %>% as.numeric(),
    length = dist(matrix(c(x1, y1, x3, y3), ncol = 2, byrow = T)) %>%
      as.numeric(),
    notch_ratio = length / notch_height_bottom
  ) %>%
  ungroup()

fn <- file.path("figures", "notch_ratio.png")

# png(fn, width = 150, height = 100, res = 900, units = "mm")
hist(points_side$notch_ratio, breaks = 100, xlab = "", main = "")

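# Classify notch placement from the length-to-notch ratio: values below the
# 2.5th percentile are coded "high", values above the 67th percentile "low",
# and everything in between "mid" (see the case_when() below).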
low <- quantile(points_side$notch_ratio, .025)
high <- quantile(points_side$notch_ratio, .67)

abline(v = low, col = "red", lwd = 2, lty = 2)
abline(v = high, col = "red", lwd = 2, lty = 2)
# dev.off()

# image_read(fn) %>% image_trim() %>%
#  image_border(geometry = "50x50", color = "white") %>%
# image_write(fn)


points_side <- points_side %>%
  mutate(
    notch_height = case_when(
      notch_ratio < low ~ "high",
      notch_ratio > high ~ "low",
      TRUE ~ "mid"
    ),
    point_type = paste0(point_type, "-", notch_height)
  ) %>%
  mutate(point_type = case_when(
    str_detect(point_type, "-high") ~ "side-notched-high", TRUE ~ point_type
  ))

table(points_side$notch_height)

#### Assigning Point Types ####

points <- points %>%
  rowwise() %>%
  mutate(
    b = ifelse(basal_notch == "Yes", "basal_notch", ""),
    s = ifelse(point_form == "side-notched", "", serrated),
    s = ifelse(point_form == "triangular" & s == "Yes", "serrated", ""),
    point_type_old = point_type
  ) %>%
  mutate(point_type = paste(
    c(point_cluster, base_type, point_form, b, notch_height, s),
    collapse = "_"
  )) %>%
  ungroup() %>%
  # select(-b,-s) %>%
  mutate(
    point_type = str_remove_all(point_type, "_NA"),
    point_type = str_remove_all(point_type, "^_|_$"),
    point_type = str_remove_all(point_type, "^_|_$"),
    point_type = str_replace_all(point_type, "__", "_"),
    point_type = str_remove_all(point_type, "^_|_$")
  ) %>%
  mutate(
    point_type = ifelse(!point_form %in% c("side-notched", "triangular"),
      "undetermined", point_type
    ),
    point_type = ifelse(point_cluster == "", "undetermined", point_type),
    point_type = ifelse(str_detect(point_type, "undetermined|unknown|archaic"),
      "undetermined", point_type
    )
  )

table(points$point_type)
length(points$point_type %>% unique())
points %>%
  filter(point_type_old != point_type) %>%
  select(`H-ID` = projectile_point_h_id, object_id, point_type) %>%
  write_tsv(file.path("data/point_types.tsv"), na = "")

valid_types <- points %>%
  filter(point_type != "undetermined") %>%
  group_by(point_form, point_type) %>%
  count()

table(valid_types$point_form)

valid_types %>%
  filter(n == 1) %>%
  pull(point_type) %>%
  paste(collapse = "\n") %>%
  cat()

valid_types %>%
  ungroup() %>%
  filter(point_form == "side-notched") %>%
  pull(n) %>%
  summary()

valid_types %>%
  ungroup() %>%
  filter(point_form == "triangular") %>%
  pull(n) %>%
  summary()

tmp <- points %>%
  filter(
    original_point_type %in% c(
      "Awatovi",
      "Bonito",
      "Citrus Side-notched",
      "Cohonino Triangular",
      "Cottonwood Triangular",
      "Desert Side Notched",
      "Snaketown Triangular Concave Base",
      "Snaketown Triangular Straight-base",
      "Sobaipuri",
      "Temporal"
    )
  )
m <- table(tmp$point_type, tmp$original_point_type) %>% as.matrix()
m[m < 3] <- 0
m <- m[rowSums(m != 0) > 0, ]
m <- m %>% t()
m <- m[rowSums(m != 0) > 0, ]
fn <- "figures/original_point_type_matrix.png"
# png(fn, width = 6, height = 6, res = 900, units = "in")
# pheatmap::pheatmap(m, display_numbers = T,
#  number_format = "%.0f", legend = F, angle_col = 315)
# dev.off()

# # trim white space
# image_read(fn) %>% image_trim() %>%
# image_border(geometry = "50x50", color = "white") %>%
#   image_write(fn)

#### Hotellings tests to discriminate between types ####

points_sub <- points %>%
  filter(point_form == "side-notched") %>%
  filter(point_type != "undetermined") %>%
  select(object_id, point_type, x1:y8) %>%
  filter(!is.na(y8))

landmarks <- points_sub %>% # nolint
  dplyr::select(x1:y8) %>%
  names()

groups <- points_sub$point_type %>%
  unique() %>%
  sort()
u <- umap(points_sub %>% select(any_of(landmarks)))
udf <- u$layout %>%
  as.data.frame() %>%
  cbind(points_sub %>%
    select(object_id, point_type)) %>%
  as_tibble() %>%
  mutate(point_type = point_type %>% factor()) %>%
  rename(x = V1, y = V2)
results_hot_side <- run_hotellings(groups, "point_type", udf %>%
  select(-object_id))
results_hot_side

results_hot_side %>%
  filter(p_value < .1) %>%
  nrow() / nrow(results_hot_side)

points_sub <- points %>%
  filter(point_form == "triangular") %>%
  filter(point_type != "undetermined") %>%
  select(object_id, point_type, x1:y5) %>%
  filter(!is.na(y5))
landmarks <- points_sub %>%
  select(x1:y5) %>%
  names()
groups <- points_sub$point_type %>%
  unique() %>%
  sort()
u <- umap(points_sub %>% select(any_of(landmarks)))
udf <- u$layout %>%
  as.data.frame() %>%
  cbind(points_sub %>%
    select(object_id, point_type)) %>%
  as_tibble() %>%
  mutate(point_type = point_type %>% factor()) %>%
  rename(x = V1, y = V2)
results_hot_tri <- run_hotellings(groups, "point_type", udf %>%
  select(-object_id))
results_hot_tri

results_hot_tri %>%
  filter(p_value < .1) %>%
  nrow() / nrow(results_hot_tri)

points_sub <- points %>%
  select(object_id, point_cluster, x1:y5) %>%
  filter(!is.na(y5)) %>%
  filter(!point_cluster %in% c(""))
landmarks <- points_sub %>%
  select(x1:y5) %>%
  names()
groups <- points_sub$point_cluster %>%
  unique() %>%
  sort()
u <- umap(points_sub %>% select(any_of(landmarks)))
udf <- u$layout %>%
  as.data.frame() %>%
  cbind(points_sub %>%
    select(object_id, point_cluster)) %>%
  as_tibble() %>%
  mutate(point_cluster = point_cluster %>% factor()) %>%
  rename(x = V1, y = V2)
results_hot_shape <- run_hotellings(groups, "point_cluster", udf %>%
  select(-object_id))
results_hot_shape

points_sub <- points %>%
  filter(notch_height != "") %>%
  select(object_id, notch_height, x1:y8) %>%
  filter(!is.na(y8))
landmarks <- points_sub %>%
  select(x1:y8) %>%
  names()
groups <- points_sub$notch_height %>%
  unique() %>%
  sort()
u <- umap(points_sub %>% select(any_of(landmarks)))
udf <- u$layout %>%
  as.data.frame() %>%
  cbind(points_sub %>%
    select(object_id, notch_height)) %>%
  as_tibble() %>%
  mutate(notch_height = notch_height %>% factor()) %>%
  rename(x = V1, y = V2)
results_hot_notch <- run_hotellings(groups, "notch_height", udf %>%
  select(-object_id))
results_hot_notch

# easily distinguishes between notch heights

points_sub <- points %>%
  select(object_id, base_type, x1:y5) %>%
  filter(!is.na(y5)) %>%
  filter(!base_type %in% c("", "unknown"))
landmarks <- points_sub %>%
  select(x1:y5) %>%
  names()
groups <- points_sub$base_type %>%
  unique() %>%
  sort()
u <- umap(points_sub %>% select(any_of(landmarks)))
udf <- u$layout %>%
  as.data.frame() %>%
  cbind(points_sub %>%
    select(object_id, base_type)) %>%
  as_tibble() %>%
  mutate(base_type = base_type %>% factor()) %>%
  rename(x = V1, y = V2)
results_hot_base <- run_hotellings(groups, "base_type", udf %>%
  select(-object_id))
results_hot_base

table <- bind_rows(
  results_hot_side,
  results_hot_tri, results_hot_shape,
  results_hot_notch, results_hot_base
) %>%
  rename(
    `Group 1` = group1, `Group 2` = group2,
    `Group 1 Total` = n_group1, `Group 2 Total` = n_group2,
    `T-squared` = T_squared, `P-value` = p_value
  ) %>%
  distinct_all() %>%
  filter(`P-value` > .1) %>%
  filter(`Group 1 Total` > 9) %>%
  filter(`Group 2 Total` > 9)
table
# write_csv(table,"tables/hotelling_test_results_all.csv")

table %>%
  separate(`Group 1`, c("a1", "b1", "c1", "d1", "e1"), sep = "_", remove = F) %>%
  separate(`Group 2`, c("a2", "b2", "c2", "d2", "e2"), sep = "_", remove = F) %>%
  mutate(rowid = 1:n()) %>%
  select(
    all_of(c("rowid", "Group 1", "Group 2")),
    a1, b1, c1, d1, e1, a2, b2, c2, d2, e2
  ) %>%
  mutate_all(replace_na, "") %>%
  mutate(matches = rowSums(
    across(c(a1, b1, c1, d1, e1)) == across(c(a2, b2, c2, d2, e2))
  ))

#### compute mean shapes ####

types1 <- table(points %>% filter(point_type != "undetermined") %>%
  filter(point_form == "side-notched") %>%
  pull(point_type)) %>%
  as.data.frame() %>%
  arrange(desc(Freq)) %>%
  head(9) %>%
  pull(Var1)
types2 <- table(points %>% filter(point_type != "undetermined") %>%
  filter(point_form == "triangular") %>%
  pull(point_type)) %>%
  as.data.frame() %>%
  arrange(desc(Freq)) %>%
  head(9) %>%
  pull(Var1)
types <- c(types1, types2)
points_sub <- points %>%
  filter(point_type != "undetermined")
ntotal <- points_sub %>% nrow()
nsub <- points_sub %>%
  filter(point_type %in% types) %>%
  nrow()
nsub / ntotal
out <- readRDS(here("data/point_outlines.Rds"))
out$fac <- out$fac %>%
  select(object_id) %>%
  left_join(points) %>%
  distinct(object_id, .keep_all = T)

out_sub <- out %>%
  filter(point_type %in% types) %>%
  mutate(point_type = factor(point_type))

x <- coo_length(out_sub)

f <- x / max(x)

out_sub <- coo_scale(out_sub, f)

# Split the object by 'point_type' using chop
out_split <- chop(out_sub, fac = "point_type")

# Compute mean shapes for each category
mean_shapes_list <- lapply(out_split, function(coo_object) {
  MSHAPES(coo_object$coo)
})

# Combine the mean shapes into a new 'Out' object
mean_shapes <- Momocs::combine(mean_shapes_list %>% Out())

# Check the resulting object
mean_shapes$fac <- tibble(type = names(mean_shapes) %>% factor()) %>%
  mutate(form = ifelse(str_detect(type, "side-notched"), "SN", "Tri"))

mean_shapes_df <- mean_shapes %>%
  as_df() %>%
  mutate(coo = map(coo, as_tibble)) %>%
  unnest(coo)

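# Explicitly close each mean shape by duplicating its first vertex.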
mean_shapes_df <- mean_shapes_df %>%
  group_by(type) %>%
  mutate(row_number = row_number()) %>%
  filter(row_number == 1) %>%
  bind_rows(mean_shapes_df, .) %>%
  arrange(type, row_number) %>%
  select(-row_number)

generate_letter_combinations <- function(n) {
  if (n <= 26) {
    return(letters[1:n])
  } else {
    # Single letters (a-z)
    single_letters <- letters

    # Double letters (aa, ab, ac, ..., zz) in the correct order
    double_letters <- expand_grid(first = letters, second = letters) %>%
      mutate(combination = paste0(first, second)) %>%
      pull(combination)

    # Combine single and double letters and take the first 'n'
    return(c(single_letters, double_letters)[1:n])
  }
}

unique_types <- mean_shapes_df %>%
  filter(form == "SN") %>%
  pull(type) %>%
  unique() %>%
  sort()
letters_combined1 <- generate_letter_combinations(length(unique_types))

# Create the tibble
dict1 <- tibble(form = "SN", type = unique_types, letter1 = letters_combined1)

unique_types <- mean_shapes_df %>%
  filter(form == "Tri") %>%
  pull(type) %>%
  unique() %>%
  sort()
letters_combined2 <- generate_letter_combinations(length(unique_types))

# Create the tibble
dict2 <- tibble(form = "Tri", type = unique_types, letter2 = letters_combined2)

mean_shapes_df <- mean_shapes_df %>%
  left_join(dict1) %>%
  left_join(dict2) %>%
  mutate(letter = ifelse(is.na(letter1), letter2, letter1)) %>%
  mutate(letter = factor(letter,
    levels =
      unique(sort(c(letters_combined1, letters_combined2)))
  ))

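# Build a more saturated version of the Spectral palette: convert the hex
# colors to polar LUV (HCL), boost chroma by 25%, and reassemble.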
hex_colors <- hcl.colors(length(letters_combined1), palette = "Spectral")
hcl_colors <- coords(as(hex2RGB(hex_colors), "polarLUV"))
hcl_colors[, "C"] <- hcl_colors[, "C"] * 1.25
saturated_colors <- hcl(hcl_colors[, "H"], hcl_colors[, "C"], hcl_colors[, "L"])

df_grps <- mean_shapes_df %>%
  group_split(form)

g1 <- ggplot(data = df_grps[[1]]) +
  geom_polygon(aes(x = x, y = y, group = type, fill = letter),
    alpha = .85, linewidth = 1, color = "black"
  ) +
  # scale_fill_viridis_d() +
  scale_fill_manual(values = saturated_colors) +
  theme_void() +
  coord_fixed(clip = "off") +
  guides(fill = "none") +
  theme(
    strip.text = element_text(size = 12, face = "bold"),
    element_rect(fill = "white")
  ) +
  facet_wrap(~letter, ncol = 3) +
  ggtitle("Side-Notched Points")

g2 <- ggplot(data = df_grps[[2]]) +
  geom_polygon(aes(x = x, y = y, group = type, fill = letter),
    alpha = .85, linewidth = 1, color = "black"
  ) +
  # scale_fill_viridis_d() +
  scale_fill_manual(values = saturated_colors) +
  theme_void() +
  coord_fixed(clip = "off") +
  guides(fill = "none") +
  theme(
    strip.text = element_text(size = 12, face = "bold"),
    element_rect(fill = "white")
  ) +
  facet_wrap(~letter, ncol = 3) +
  ggtitle("Triangular Points")

g <- cowplot::plot_grid(g1, g2, ncol = 2)
g
nm <- "mean_types"

# ggsave(file.path("figures",paste0(nm,".png")), dpi = 900,
#  width = 170, units = "mm")

# file.path("figures",paste0(nm,".png")) %>% image_read() %>%
#  image_trim() %>% image_border(geometry = "50x50", color = "white") %>%
#    image_write(file.path("figures",paste0(nm,".png")))

dict <- df_grps[[1]] %>%
  distinct(form, type, letter) %>%
  arrange(form)

paste(
  "For side-notched types:",
  map2_chr(dict$type, dict$letter, function(x, y) paste0(x, " (", y, ")")) %>%
    paste(collapse = ", ")
) %>% cat()

dict <- df_grps[[2]] %>%
  distinct(form, type, letter) %>%
  arrange(form)

paste(
  "For triangular types:",
  map2_chr(dict$type, dict$letter, function(x, y) paste0(x, " (", y, ")")) %>%
    paste(collapse = ", ")
) %>% cat()

#### Stack Points ####

types1 <- table(points %>% filter(point_type != "undetermined") %>%
  filter(point_form == "side-notched") %>%
  pull(point_type)) %>%
  as.data.frame() %>%
  arrange(desc(Freq)) %>%
  head(9) %>%
  pull(Var1)
types2 <- table(points %>% filter(point_type != "undetermined") %>%
  filter(point_form == "triangular") %>%
  pull(point_type)) %>%
  as.data.frame() %>%
  arrange(desc(Freq)) %>%
  head(9) %>%
  pull(Var1)
types <- c(types1, types2)
points_sub <- points %>%
  filter(point_type != "undetermined")
ntotal <- points_sub %>% nrow()
nsub <- points_sub %>%
  filter(point_type %in% types) %>%
  nrow()
nsub / ntotal
out <- readRDS(here("data/point_outlines.Rds"))

# x = out_sub %>%
# filter(object_id == "NA680.R3.9-1")

# out = out %>%
# filter(object_id != "NA3673T.174-1")

# stack(x)

out$fac <- out$fac %>%
  select(object_id) %>%
  left_join(points) %>%
  distinct(object_id, .keep_all = T)

out_sub <- out %>%
  filter(point_type %in% types) %>%
  mutate(point_type = factor(point_type))

out_aligned <- out_sub %>% fgProcrustes()

x <- coo_length(out_sub)

f <- x / max(x)

out_sub <- coo_scale(out_sub, f)

out_df <- out_aligned %>%
  as_df() %>%
  mutate(coo = map(coo, as_tibble)) %>%
  unnest(coo)

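# Duplicate the first vertex of each outline so geom_path() draws a closed
# shape in the stacked plots below.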
out_df <- out_df %>%
  group_by(point_type) %>%
  rowid_to_column("row_number") %>%
  filter(row_number == 1) %>%
  bind_rows(out_df, .) %>%
  arrange(point_type, row_number) %>%
  mutate(row_number = row_number())

generate_letter_combinations <- function(n) {
  if (n <= 26) {
    return(letters[1:n])
  } else {
    # Single letters (a-z)
    single_letters <- letters

    # Double letters (aa, ab, ac, ..., zz) in the correct order
    double_letters <- expand_grid(first = letters, second = letters) %>%
      mutate(combination = paste0(first, second)) %>%
      pull(combination)

    # Combine single and double letters and take the first 'n'
    return(c(single_letters, double_letters)[1:n])
  }
}

unique_types <- out_df %>%
  filter(point_form == "side-notched") %>%
  pull(point_type) %>%
  unique() %>%
  sort()
letters_combined1 <- generate_letter_combinations(length(unique_types))

# Create the tibble
dict1 <- tibble(
  point_form = "side-notched", point_type = unique_types,
  letter1 = letters_combined1
)

unique_types <- out_df %>%
  filter(point_form == "triangular") %>%
  pull(point_type) %>%
  unique() %>%
  sort()
letters_combined2 <- generate_letter_combinations(length(unique_types))

# Create the tibble
dict2 <- tibble(
  point_form = "triangular", point_type = unique_types,
  letter2 = letters_combined2
)

out_df <- out_df %>%
  left_join(dict1) %>%
  left_join(dict2) %>%
  mutate(letter = ifelse(is.na(letter1), letter2, letter1)) %>%
  mutate(letter = factor(letter,
    levels =
      unique(sort(c(letters_combined1, letters_combined2)))
  ))

hex_colors <- hcl.colors(length(letters_combined1), palette = "Spectral")
hcl_colors <- coords(as(hex2RGB(hex_colors), "polarLUV"))
hcl_colors[, "C"] <- hcl_colors[, "C"] * 1.25
saturated_colors <- hcl(hcl_colors[, "H"], hcl_colors[, "C"], hcl_colors[, "L"])

df_grps <- out_df %>%
  rename(x = V1, y = V2) %>%
  arrange(point_type, row_number) %>%
  group_split(point_form)

g1 <- ggplot(data = df_grps[[1]]) +
  geom_path(aes(x = x, y = y, group = object_id, color = letter),
    alpha = .85
  ) +
  scale_color_manual(values = saturated_colors) +
  theme_void() +
  coord_fixed(clip = "off") +
  guides(color = "none") +
  theme(
    strip.text = element_text(size = 12, face = "bold"),
    plot.background = element_rect(
      fill = "#e8eaed", color = "black", linewidth = 1.5
    ),
    plot.title = element_text(hjust = .1)
  ) +
  facet_wrap(~letter, ncol = 3) +
  ggtitle("Side-Notched Points")
g2 <- ggplot(data = df_grps[[2]]) +
  geom_path(aes(x = x, y = y, group = object_id, color = letter),
    alpha = .85
  ) +
  scale_color_manual(values = saturated_colors) +
  theme_void() +
  coord_fixed(clip = "off") +
  guides(color = "none") +
  theme(
    strip.text = element_text(size = 12, face = "bold"),
    plot.background = element_rect(
      fill = "#e8eaed", color = "black", linewidth = 1.5
    ),
    plot.title = element_text(hjust = .1)
  ) +
  facet_wrap(~letter, ncol = 3) +
  ggtitle("Triangular Points")

g <- cowplot::plot_grid(g1, g2, ncol = 2, align = "hv")
g
nm <- "types_stacked"

# ggsave(file.path("figures",paste0(nm,".png")),
#  dpi = 900, width = 170, units = "mm")
#
# file.path("figures",paste0(nm,".png")) %>%
# image_read() %>%
#  image_trim() %>%
#  image_border(geometry = "50x50", color = "white") %>%
#    image_write(file.path("figures",paste0(nm,".png")))

dict <- df_grps[[1]] %>%
  distinct(point_form, point_type, letter) %>%
  arrange(point_form)

paste(
  "For side-notched types:",
  map2_chr(dict$point_type, dict$letter, function(x, y) paste0(x, " (", y, ")")) %>%
    paste(collapse = ", ")
) %>% cat()

dict <- df_grps[[2]] %>%
  distinct(point_form, point_type, letter) %>%
  arrange(point_form)

paste(
  "For triangular types:",
  map2_chr(dict$point_type, dict$letter, function(x, y) paste0(x, " (", y, ")")) %>%
    paste(collapse = ", ")
) %>% cat()


#### Point Distributions  ####

points <- rio::import(file.path("data/heurist_projectilePoints.csv")) %>%
  janitor::clean_names() %>%
  as_tibble()

points <- points %>%
  filter(!point_type %in% c("undetermined"))

table(points$point_type) %>%
  t() %>%
  t()

# create hexagonal grids

site_data <- rio::import(file.path("data/heurist_sites.csv")) %>%
  janitor::clean_names() %>%
  as_tibble()
states <- rnaturalearth::ne_states() %>%
  filter(name %in% c("Arizona", "New Mexico")) %>%
  st_transform(26912)
sites <- site_data %>%
  select(site_name, long, lat, start_date, end_date) %>%
  filter(site_name != "") %>%
  distinct_all() %>%
  mutate_at(vars(long, lat), as.numeric) %>%
  filter(!is.na(long)) %>%
  st_as_sf(coords = c("long", "lat"), crs = 4326, remove = F) %>%
  st_transform(26912)

point_freq <- points %>%
  filter(site_name != "") %>%
  select(site_name, point_type) %>%
  group_by(site_name, point_type) %>%
  count(name = "count") %>%
  group_by(site_name) %>%
  mutate(total = sum(count)) %>%
  mutate(freq = count / total) %>%
  ungroup() %>%
  filter(total > 4)

point_freq <- inner_join(sites %>% select(site_name), point_freq)

sites <- sites %>%
  filter(site_name %in% point_freq$site_name)

grid <- create_grid_hexagonal(sites, cell_size = 36000)
types <- point_freq$point_type %>%
  unique() %>%
  sort()
p <- list()
for (type in types) {
  print(type)
  plotdf <- point_freq %>%
    filter(point_type == type)
  total_type <- sum(plotdf$count)
  if (total_type < 10) next # skip if less than 10 points
  grid <- create_grid_hexagonal(sites, cell_size = 36000)
  plotdf <- plotdf %>%
    select(-site_name) %>%
    st_join(grid %>% mutate(id = 1:n())) %>%
    group_by(id) %>%
    summarize(
      count = sum(count),
      total = sum(total), freq = count / total
    ) %>%
    st_centroid()
  r <- kde(plotdf %>% select(freq), 72000, grid = grid)
  p[[type]] <- ggplot() +
    geom_sf(data = states, fill = "white") +
    # geom_sf(data = grid_data, aes(fill = freq)) +
    geom_sf(data = r, aes(fill = kde_value)) +
    geom_sf(
      data = sites,
      color = "white", alpha = 0.75, shape = 17, size = 1
    ) +
    scale_fill_viridis_c() +
    theme_minimal() +
    theme(plot.margin = unit(c(2, 2, 2, 2), "mm")) + # Set small plot margins
    labs(title = type, subtitle = paste(total_type, "total points")) +
    coord_sf(xlim = c(150000, 850000), ylim = c(3600000, 4124890))
  ggsave(file.path(paste0(
    "figures/points_kde/",
    type, "-kde_hex_binned.png"
  )), width = 8, height = 4.5, dpi = 600)
}

point_freq <- points %>%
  filter(site_name != "") %>%
  filter(point_form %in% c("side-notched", "triangular")) %>%
  select(site_name, point_form) %>%
  group_by(site_name, point_form) %>%
  count(name = "count") %>%
  group_by(site_name) %>%
  mutate(total = sum(count)) %>%
  mutate(freq = count / total) %>%
  ungroup() %>%
  filter(total > 4)

point_freq <- inner_join(sites %>% select(site_name), point_freq)

sites <- sites %>%
  filter(site_name %in% point_freq$site_name)

grid <- create_grid_hexagonal(sites, cell_size = 36000)
types <- point_freq$point_form %>%
  unique() %>%
  sort()
p <- list()
for (type in types) {
  print(type)
  plotdf <- point_freq %>%
    filter(point_form == type)
  total_type <- sum(plotdf$count)
  if (total_type < 10) next # skip if less than 10 points
  grid <- create_grid_hexagonal(sites, cell_size = 36000)
  plotdf <- plotdf %>%
    select(-site_name) %>%
    st_join(grid %>% mutate(id = 1:n())) %>%
    group_by(id) %>%
    summarize(
      count = sum(count),
      total = sum(total), freq = count / total
    ) %>%
    st_centroid()
  r <- kde(plotdf %>% select(freq), 72000, grid = grid)
  p[[type]] <- ggplot() +
    geom_sf(data = states, fill = "white") +
    geom_sf(data = r, aes(fill = kde_value)) +
    geom_sf(
      data = sites,
      color = "white", alpha = 0.75, shape = 17, size = 1
    ) +
    scale_fill_viridis_c() +
    theme_minimal() +
    theme(plot.margin = unit(c(2, 2, 2, 2), "mm")) + # Set small plot margins
    labs(title = type, subtitle = paste(total_type, "total points")) +
    coord_sf(xlim = c(150000, 850000), ylim = c(3600000, 4124890))
  ggsave(file.path(paste0(
    "figures/points_kde/",
    type, "-kde_hex_binned.png"
  )), width = 8, height = 4.5, dpi = 600)
}

point_freq <- points %>%
  filter(site_name != "") %>%
  filter(point_form %in% c("side-notched", "triangular")) %>%
  select(site_name, point_cluster) %>%
  group_by(site_name, point_cluster) %>%
  count(name = "count") %>%
  group_by(site_name) %>%
  mutate(total = sum(count)) %>%
  mutate(freq = count / total) %>%
  ungroup() %>%
  filter(total > 4)

point_freq <- inner_join(sites %>% select(site_name), point_freq)

sites <- sites %>%
  filter(site_name %in% point_freq$site_name)
grid <- create_grid_hexagonal(sites, cell_size = 36000)
types <- point_freq$point_cluster %>%
  unique() %>%
  sort()
p <- list()
for (type in types) {
  print(type)
  plotdf <- point_freq %>%
    filter(point_cluster == type)
  total_type <- sum(plotdf$count)
  if (total_type < 10) next # skip if less than 10 points
  grid <- create_grid_hexagonal(sites, cell_size = 36000)
  plotdf <- plotdf %>%
    select(-site_name) %>%
    st_join(grid %>% mutate(id = 1:n())) %>%
    group_by(id) %>%
    summarize(
      count = sum(count),
      total = sum(total), freq = count / total
    ) %>%
    st_centroid()
  r <- kde(plotdf %>% select(freq), 72000, grid = grid)
  p[[type]] <- ggplot() +
    geom_sf(data = states, fill = "white") +
    geom_sf(data = r, aes(fill = kde_value)) +
    geom_sf(data = sites, color = "white", alpha = 0.75, shape = 17, size = 1) +
    scale_fill_viridis_c() +
    theme_minimal() +
    guides(fill = "none") +
    theme(plot.margin = unit(c(2, 2, 2, 2), "mm")) + # Set small plot margins
    labs(title = type, subtitle = paste(total_type, "total points")) +
    coord_sf(xlim = c(150000, 850000), ylim = c(3600000, 4124890))
  ggsave(
    file.path(
      paste0("figures/points_kde/", type, "-kde_hex_binned_nolegend.png")
    ),
    width = 8, height = 4.5, dpi = 600, bg = "white"
  )
}

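# Tile a list of magick images into a grid with `nrow` rows: trim and pad
# each image, add a blank filler if the count is odd, append images
# row-wise, stack the rows, and write the result to `file_name`.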
combine_imgs <- function(imgs, nrow, file_name) {
  imgs <- lapply(imgs, function(x) {
    image_trim(x) %>%
      image_border(geometry = "50x50", color = "white") %>%
      image_scale("1500x")
  })
  if (length(imgs) %% 2 != 0) {
    print("adding extra image to make even number of images")
    info <- image_info(imgs[[1]])
    width <- info$width
    height <- info$height
    imgs <- c(imgs, image_blank(
      width = width, height = height, color = "white"
    ))
  }
  ncol <- length(imgs) / nrow
  sq <- seq(1, length(imgs), ncol)
  rows <- list()
  for (i in 1:nrow) {
    indx <- sq[i]:(sq[i] + ncol - 1)
    print(indx)
    rows[[i]] <- image_append(do.call(c, imgs[indx]))
  }
  final_image <- image_append(do.call(c, rows), stack = TRUE)
  final_image <- image_scale(final_image, "4500x")
  image_write(final_image, path = file_name, density = 900)
  print(paste("saved to", file_name))
}

imgs <- lapply(types, function(x) {
  image_read(
    file.path(paste0(
      "figures/points_kde/",
      x, "-kde_hex_binned_nolegend.png"
    ))
  )
})

# combine_imgs(imgs, nrow = 2,
#  file_name = file.path("figures/combined_hexbins.png"))

# combine top 4 types for side-notch and triangular

types_tri <- points %>%
  filter(point_form == "triangular") %>%
  select(point_type) %>%
  group_by(point_type) %>%
  count() %>%
  ungroup() %>%
  slice_max(n = 6, order_by = n) %>%
  pull(point_type)

types_side <- points %>%
  filter(point_form == "side-notched") %>%
  select(point_type) %>%
  group_by(point_type) %>%
  count() %>%
  ungroup() %>%
  slice_max(n = 6, order_by = n) %>%
  pull(point_type)

# imgs_tri = lapply(types_tri,function(x)
# image_read(file.path(paste0(
#   "figures/points_kde/",x,"-kde_hex_binned.png"
#   ))))
# combine_imgs(imgs = imgs_tri, nrow = 3,
#  file_name = file.path(
#   "figures/combined_hexbins_triangular.png"
#   ))
# imgs_side = lapply(types_side,function(x)
# image_read(file.path(paste0(
#   "figures/points_kde/",x,"-kde_hex_binned.png"
#   ))))
# combine_imgs(imgs = imgs_side, nrow = 3,
#  file_name = file.path(
#   "figures/combined_hexbins_side.png"
#   ))

#### Community Detection  ####

base <- readRDS(file.path("data/backgroundmap.Rds"))
base_tonto <- readRDS(file.path("data/backgroundmap_tonto.Rds"))

site_data <- rio::import(file.path("data/heurist_sites.csv")) %>%
  janitor::clean_names() %>%
  as_tibble()
states <- rnaturalearth::ne_states() %>%
  filter(name %in% c("Arizona", "New Mexico")) %>%
  st_transform(26912)
sites <- site_data %>%
  select(site_name, long, lat, start_date, end_date) %>%
  filter(site_name != "") %>%
  distinct_all() %>%
  mutate_at(vars(long, lat), as.numeric) %>%
  filter(!is.na(long)) %>%
  st_as_sf(coords = c("long", "lat"), crs = 4326, remove = F) %>%
  st_transform(26912)

points_sub <- points %>%
  left_join(sites %>% select(site_name, start_date, end_date)) %>%
  filter(!point_type %in% c("undetermined")) %>%
  filter(!site_name %in% c("", "Wupatki National Monument"))

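# Site-by-site similarity from point-type composition: convert counts to row
# percentages, take Manhattan distances, and rescale with 1 - (d / 200) to a
# 0-1 similarity (equivalent to a rescaled Brainerd-Robinson coefficient).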
d <- table(points_sub$site_name, points_sub$point_type) %>%
  prop.table(margin = 1) %>%
  as.matrix() %>%
  {
    . * 100
  }
m <- dist(d, method = "manhattan") %>% as.matrix()
m <- 1 - (m / 200)
diag(m) <- 0
colnames(m)
hist(m)
edges <- m %>%
  as.table() %>%
  as.data.frame() %>%
  setNames(c("from", "to", "weight")) %>%
  filter(from != "to") %>%
  filter(weight > quantile(weight, 0.75))
library(igraph)
g <- graph_from_data_frame(d = edges, directed = F)
g <- simplify(g)
plot(g)
communities <- cluster_louvain(g, weights = E(g)$weight, resolution = 1)
membership_df <- data.frame(
  community = membership(communities)
) %>%
  rownames_to_column("site_name")

sites_communities <- sites %>%
  inner_join(membership_df) %>%
  mutate(community = factor(community))

# mapview::mapview(sites_communities, zcol = "community")

coords <- st_coordinates(st_transform(sites_communities, 26912))

sites_communities_df <- sites_communities %>%
  st_drop_geometry() %>%
  mutate(x = coords[, 1], y = coords[, 2]) %>%
  distinct(lat, long, .keep_all = T) %>%
  mutate(community = factor(community))

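# Bounding box used to crop sites for the inset map shown below.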
tonto_boundary <- st_as_sfc(st_bbox(
  c(
    xmin = -111.04,
    ymin = 33.62, xmax = -110.92, ymax = 33.68
  ),
  crs = 4326
)) %>%
  st_transform(26912) %>%
  st_cast("POLYGON") %>%
  st_sfc() %>%
  st_sf() %>%
  mutate(id = "tonto boundary")

tonto_sites <- sites_communities_df %>%
  st_as_sf(coords = c("x", "y"), crs = 26912, remove = F) %>%
  st_crop(tonto_boundary)

table(tonto_sites$site_name, tonto_sites$community)

# Define shape and color mappings
shape_mapping <- c("1" = 16, "2" = 17, "3" = 18, "4" = 15) # Shape codes
color_mapping <- c(
  "1" = "#440154FF", "2" = "#FDE725FF",
  "3" = "#31688EFF", "4" = "#35B779FF"
) # Colors

insert <- base_tonto +
  geom_sf(data = tonto_sites, aes(
    color = community,
    shape = community
  ), alpha = 0.8, size = 3) +
  scale_shape_manual(values = shape_mapping) +
  scale_color_manual(values = color_mapping) +
  coord_sf(
    xlim = c(496289.7, 507420.6),
    ylim = c(3720024.3, 3726678.8)
  ) +
  guides(color = "none", shape = "none") +
  theme(plot.background = element_rect(
    fill = "white",
    color = "red", linewidth = 2
  )) +
  ggtitle("Roosevelt")
insert
insert <- ggplotGrob(insert)

xmin_inset <- 600000
xmax_inset <- 900000
ymin_inset <- 3660000
ymax_inset <- 3770000

g <- base +
  # ggplot() +
  geom_point(data = sites_communities_df, aes(
    x = x, y = y,
    color = community, shape = community
  ), alpha = 0.7, size = 3) +
  scale_shape_manual(values = shape_mapping) +
  scale_color_manual(values = color_mapping) +
  geom_sf(data = tonto_boundary, color = "white", fill = "red", alpha = 0.6) +
  annotation_custom(
    grob = insert,
    xmin = 600000, xmax = 900000, ymin = 3660000, ymax = 3770000
  ) +
  theme(legend.position = "top")

nm <- "point_type_boundaries"

# ggsave(file.path("figures",paste0(nm,".png")), dpi = 900, width = 170, units = "mm")

# file.path("figures",paste0(nm,".png")) %>% image_read() %>%
#  image_trim() %>% image_border(geometry = "50x50", color = "white") %>%
#    image_write(file.path("figures",paste0(nm,".png")))


#### Machine Learning Confusion Matrices  ####

# Load libraries
library(tidyverse)
library(janitor)
library(readxl)
library(cowplot)
library(ggtext)


# Define model categories and corresponding metadata columns
models <- list(
  bases = "base_type",
  notches = "notch_height",
  shape = "point_cluster",
  serrated = "serrated"
)

notches <- rio::import("data/predicted_types_notches.xlsx") %>%
  janitor::clean_names() %>%
  as_tibble()
serrated <- rio::import("data/predicted_types_serrated.xlsx") %>%
  janitor::clean_names() %>%
  as_tibble()
shape <- rio::import("data/predicted_types_shape.xlsx") %>%
  janitor::clean_names() %>%
  as_tibble()
bases <- rio::import("data/predicted_types_bases.xlsx") %>%
  janitor::clean_names() %>%
  as_tibble() %>%
  filter(base_type != "unknown")

dfs <- list(
  bases = bases,
  notches = notches,
  shape = shape,
  serrated = serrated
)

# Helper function to generate ggplot confusion matrix
plot_conf_mat <- function(df, model) {
  print(model)
  df <- tibble(truth = factor(df[[model]]), prediction = factor(df$predicted_type)) %>%
    filter(!is.na(truth), !is.na(prediction))

  tab <- table(df$truth, df$prediction)
  acc <- round(sum(diag(tab)) / sum(tab), 3)
  title <- str_replace_all(model, "_", " ") %>%
    str_to_title()

  as.data.frame(tab) %>%
    rename(Truth = Var1, Predicted = Var2, n = Freq) %>%
    ggplot(aes(x = Predicted, y = Truth, fill = n)) +
    geom_tile(color = "white") +
    geom_text(aes(label = n), color = "black", size = 4) +
    scale_fill_gradient(low = "white", high = "#2c7fb8") +
    theme_minimal(base_size = 10) +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "none",
      plot.title = element_markdown(size = 11, face = "bold"),
      plot.background = element_rect(fill = "white", color = "black", linewidth = 1.5)
    ) +
    labs(
      title = glue::glue("{title}<br><span style='font-size:10pt;'>Accuracy = {acc}</span>"),
      x = "Predicted",
      y = "True"
    )
}

# Generate plots
plots <- map2(
  dfs,
  models,
  plot_conf_mat
)

# Combine into single figure
final_plot <- plot_grid(plotlist = plots, ncol = 2, labels = "AUTO")

# Save to file
fn <- "figures/confusion_matrices_combined.png"
ggsave(fn, final_plot, width = 6.5, height = 8, dpi = 600)

image_read(fn) %>%
  image_trim() %>%
  image_border(geometry = "50x50", color = "white") %>%
  image_write(fn)


#### export session info  ####
# Get the R version
r_version <- paste0("R version=", R.version$major, ".", R.version$minor)

# Get the loaded packages and their versions
loaded_packages <- sessionInfo()$otherPkgs
loaded_versions <- sapply(
  loaded_packages,
  function(x) paste(x$Package, x$Version, sep = "=")
)

# Write the R version and package versions to the file
writeLines(c(r_version, loaded_versions), con = "analysis/r_environment2.qmd")

R Environment

R version=4.4.3 cowplot=1.1.3 Hotelling=1.0-8 corpcor=1.6.10 rnaturalearth=1.0.1 viridis=0.6.5 viridisLite=0.4.2 colorspace=2.1-1 ggfx=1.0.1 ggnewscale=0.5.1 ggrepel=0.9.6 terra=1.8-29 SpatialKDE=0.8.2 randomForest=4.7-1.2 umap=0.2.10.0 sf=1.0-19 magick=2.8.5 gt=0.11.1 flextable=0.9.7 NetworkDistance=0.3.4 vegan=2.6-10 lattice=0.22-6 permute=0.9-7 ggnetwork=0.5.13 igraph=2.1.4 rio=1.2.3 here=1.0.1 ggthemes=5.1.0 huxtable=5.5.7 kableExtra=1.4.0 Momocs=1.4.1 magrittr=2.0.3 lubridate=1.9.4 forcats=1.0.0 stringr=1.5.1 dplyr=1.1.4 purrr=1.0.4 readr=2.1.5 tidyr=1.3.1 tibble=3.2.1 ggplot2=3.5.1 tidyverse=2.0.0

Python Training Code

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from torchvision.models import efficientnet_v2_s
import contextlib
import time
from sklearn.model_selection import train_test_split


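# Evaluate the model on a held-out loader: accumulate the criterion loss and
# top-1 accuracy over all batches with gradients disabled.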
def test_model(model, test_loader, device, criterion):
    model.eval()
    correct = 0
    total = 0
    test_loss = 0.0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            test_loss += loss.item()

            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    avg_test_loss = test_loss / len(test_loader)
    test_accuracy = 100 * correct / total
    print(
        f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
    return avg_test_loss, test_accuracy


def train_and_save_model(class_type, form, include_types, version):
    # Load the point attribute CSV
    data_path = "data/heurist_projectilePoints.csv"
    df = pd.read_csv(data_path)

    if isinstance(form, str):
        form = [form]

    df = df[df['Point Form'].isin(form)].copy()
    df = df[df[class_type].isin(include_types)].copy()
    df['Media file Name'] = df['objectID'].astype(str) + ".png"

    img_dir = "data/pointImages/ml"
    image_filenames = set(os.listdir(img_dir))
    df_exists = df[df['Media file Name'].isin(image_filenames)].copy()

    # Lump attribute values with fewer than 10 images into 'other'
    # (the 'count' column produced by value_counts().reset_index()
    # requires pandas >= 2.0)
    counts = df_exists[class_type].value_counts().reset_index()
    counts[f'{class_type}_new'] = counts.apply(
        lambda row: row[class_type] if row['count'] > 9 else 'other', axis=1
    )
    counts = counts[[class_type, f'{class_type}_new']]
    df_merged = pd.merge(df_exists, counts, on=class_type, how='left')
    df_merged = df_merged.drop(columns=[class_type])
    df_merged = df_merged.rename(columns={f'{class_type}_new': class_type})
    df_merged = df_merged[~df_merged[class_type].isin(
        ['Unknown', 'other', 'Unclassified']
    )].copy()
    df_merged = df_merged.dropna(subset=[class_type]).copy()
    # Stratified 85/15 train/test split so class proportions are
    # preserved in both subsets
    train_df, test_df = train_test_split(
        df_merged, test_size=0.15,
        stratify=df_merged[class_type], random_state=42
    )

    # Define the number of rotations for each image to expand the dataset
    ROTATION_ANGLES = [0]  # You can adjust rotation angles if needed

    # Create a custom dataset for images and labels with augmentations
    class AugmentedPointClusterDataset(Dataset):
        def __init__(self, df, img_dir, transform=None,
                     rotation_angles=ROTATION_ANGLES):
            self.df = df
            self.img_dir = img_dir
            self.transform = transform
            self.rotation_angles = rotation_angles
            self.label_to_idx = {
                label: idx for idx, label in enumerate(include_types)}
            self.idx_to_label = {
                idx: label for label, idx in self.label_to_idx.items()}

        def __len__(self):
            return len(self.df) * len(self.rotation_angles)

        def __getitem__(self, idx):
            original_idx = idx // len(self.rotation_angles)
            rotation_angle = self.rotation_angles[idx % len(
                self.rotation_angles
            )]

            img_name = self.df.iloc[original_idx]["Media file Name"]
            img_path = os.path.join(self.img_dir, img_name)
            image = Image.open(img_path).convert("RGB")
            image = image.rotate(rotation_angle)

            if self.transform:
                image = self.transform(image)

            label = self.df.iloc[original_idx][class_type]
            label_idx = self.label_to_idx[label]

            return image, label_idx

    # Define transformations: 224x224 input with standard ImageNet
    # normalization statistics
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    # Create Dataset and DataLoader with augmentations
    train_dataset = AugmentedPointClusterDataset(
        train_df, img_dir, transform=transform)
    test_dataset = AugmentedPointClusterDataset(
        test_df, img_dir, transform=transform)
    train_loader = DataLoader(
        train_dataset, batch_size=4, shuffle=True)
    test_loader = DataLoader(
        test_dataset, batch_size=4, shuffle=False)

    # Load the ImageNet-pretrained EfficientNetV2-S model and replace the
    # classification head (the weights= API supersedes the deprecated
    # pretrained=True argument)
    model = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.DEFAULT)
    num_classes = len(include_types)
    model.classifier[1] = nn.Linear(
        model.classifier[1].in_features, num_classes)  # type: ignore

    # Set up loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)  # type: ignore

    # Training loop
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(device)

    epochs = 12
    form_string = "_".join(form)
    log_file_path = f"models/class_{class_type}_{form_string}_{version}.txt"

    for epoch in range(epochs):
        start_time = time.time()

        model.train()

        running_loss = 0.0
        correct = 0
        total = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = 100 * correct / total
        elapsed_time = time.time() - start_time

        print(f"""
              Epoch {epoch + 1}/{epochs},
              Loss: {epoch_loss:.4f}, 
              Accuracy: {epoch_accuracy:.2f}%, 
              Time: {elapsed_time:.2f} 
              seconds
              """)

        # Save the model for the current epoch
        model_save_path = (
            f"models/class_{class_type}_{form_string}"
            f"_epoch_{epoch + 1}_{version}.pth"
        )
        torch.save(model.state_dict(), model_save_path)
        print(f"Model saved to {model_save_path}")

        # Log training metrics to file
        with open(log_file_path, 'a') as log_file:
            log_file.write(
                f"Epoch {epoch + 1}/{epochs}, "
                f"Loss: {epoch_loss:.4f}, "
                f"Accuracy: {epoch_accuracy:.2f}%\n"
            )

        # Evaluate on the test set and log test metrics
        test_loss, test_accuracy = test_model(
            model, test_loader, device, criterion)
        with open(log_file_path, 'a') as log_file:
            log_file.write(
                f"Test Loss: {test_loss:.4f}, "
                f"Test Accuracy: {test_accuracy:.2f}%\n"
            )


# Usage: one run per classification model; the commented-out blocks record
# the other training runs
data_path = "data/heurist_projectilePoints.csv"
df = pd.read_csv(data_path)
df.columns

# df['Point Form'].unique()

# df['Notch Height'].unique()
# class_type = 'Notch Height'
# form = ['side-notched']
# include_types = ["low","mid","high"]
# train_and_save_model(class_type=class_type, form=form,
#  include_types=include_types, version="v1")
# models\class_Notch Height_side-notched_epoch_12_v1.pth

df['Base Type'].unique()
class_type = 'Base Type'
form = ['side-notched', 'triangular']
include_types = ['straight', 'concave', 'deeply concave', 'convex']
train_and_save_model(class_type=class_type, form=form,
                     include_types=include_types, version="v2")
# models\class_Base Type_side-notched_triangular_epoch_12_v2.pth

# class_type = 'Point Cluster'
# df[class_type].unique()
# form = ['side-notched','triangular']
# include_types = ['shape-2', 'shape-3', 'shape-1']
# train_and_save_model(class_type=class_type, form=form,
#  include_types=include_types, version="v1")
# class_Point Cluster_side-notched_triangular_epoch_9_v1.pth

# class_type = 'Serrated'
# df[class_type].unique()
# form = ['side-notched','triangular']
# include_types = ['No', 'Yes']
# train_and_save_model(class_type=class_type, form=form,
#  include_types=include_types, version="v1")
# models\class_Serrated_side-notched_triangular_epoch_12_v1.pth

Python Evaluation Code

import os
import pandas as pd
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image
from torchvision.models import efficientnet_v2_s
import torch.nn.functional as F
from tqdm import tqdm


def predict(image_path, model, transform, labels, top_k=5, verbose=False):
    # Automatically adjust top_k based on the number of classes
    num_classes = len(labels)
    top_k = min(top_k, num_classes)

    model.eval()  # Set the model to evaluation mode
    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():  # Disable gradient computation for inference
        outputs = model(image)  # Get the raw logits from the model

        # Apply softmax to get the probabilities
        probabilities = F.softmax(outputs, dim=1)

        # Get the top k probabilities and their corresponding indices
        top_probs, top_idxs = probabilities.topk(top_k, dim=1)

        # Convert the top probabilities and indices to lists
        top_probs = top_probs.cpu().numpy().flatten()
        top_idxs = top_idxs.cpu().numpy().flatten()

        # Print the top k predictions with probabilities
        if verbose:
            print("Top predictions:")
            for i in range(top_k):
                label = labels[top_idxs[i]]  # Get the label based on the index
                prob = top_probs[i]
                print(f"{i+1}: {label} ({prob:.4f})")

        predicted_idx = top_idxs[0]
        # Get the label from the labels list
        predicted_label = labels[predicted_idx]
        predicted_prob = top_probs[0]

    return predicted_label, predicted_prob


# Define class_type, form, and include_types for evaluation; these must
# match the checkpoint loaded below (here the four-class Base Type model)
class_type = 'Base Type'
form = ['side-notched', 'triangular']
include_types = ['straight', 'concave', 'deeply concave', 'convex']
# Define transformations (no random augmentation at inference time)
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to 224x224 for EfficientNet
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

labels = include_types

# Build the EfficientNetV2-S architecture; ImageNet weights are not needed
# here because the fine-tuned checkpoint is loaded next
model = efficientnet_v2_s(weights=None)
model.classifier[1] = nn.Linear(
    model.classifier[1].in_features, len(labels)
)  # type: ignore

# Load the fine-tuned checkpoint saved by the training code
# (map_location lets it load on CPU-only machines as well)
model_save_path = (
    "models/class_Base Type_side-notched_triangular_epoch_12_v2.pth"
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(torch.load(model_save_path, map_location=device))
model.to(device)
print(device)

# Define image directory and data path
img_dir = "data/pointImages/ml"

# Get a set of all image filenames in img_dir
image_filenames = set(os.listdir(img_dir))
if isinstance(form, str):
    form = [form]
data_path = "data/heurist_projectilePoints.csv"
df = pd.read_csv(data_path)
df['Media file Name'] = df['objectID'].astype(str) + ".png"
df_exists = df[df['Media file Name'].isin(image_filenames)].copy()
len(df_exists)
df_exists[class_type].unique()
df_unknown = df_exists
# df_unknown = df_exists[pd.isna(df_exists[class_type])]
# df_unknown = df_unknown[df_unknown['Point Form'].isin(form)].copy()
# df_unknown = df_unknown[['Projectile point H-ID',
#  'objectID', class_type, 'Media file Name', 'Point Form']]
# df_unknown['Predicted Type'] = None
# df_unknown['Prediction Probability'] = None
len(df_unknown)

# Predict and update df_unknown DataFrame
for image_name in tqdm(df_unknown['Media file Name'],
                       desc="Processing images"):
    if image_name.lower().endswith(".png"):
        image_path = os.path.join(img_dir, image_name)

        # Use the predict function to get
        #  the predicted class and probability
        predicted_cluster, predicted_prob = predict(
            image_path, model, transform, labels, top_k=5, verbose=False
        )

        # Set predicted_cluster to
        #  "Unclassified" if predicted_prob is below 0.5
        # predicted_cluster =
        #  "Unclassified" if predicted_prob < 0.5 else predicted_cluster

        # Update the df_unknown DataFrame with the predicted type and probability
        df_unknown.loc[df_unknown['Media file Name'] == image_name,
                       'Predicted Type'] = predicted_cluster
        df_unknown.loc[df_unknown['Media file Name'] == image_name,
                       'Prediction Probability'] = predicted_prob

# Save the updated df_unknown DataFrame to a TSV file
output_tsv_path = "data/predicted_types.tsv"
df_unknown.to_csv(output_tsv_path, sep='\t', index=False)

print(f"Results saved to {output_tsv_path}")

Python Environment

Note: this Python environment was used for all Python code in this dissertation.

python==3.8.19
aiofiles==23.2.1
aiohappyeyeballs==2.4.3
aiohttp==3.10.11
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
asgiref==3.8.1
asttokens==3.0.0
async-timeout==5.0.1
asyncer==0.0.8
attrs==24.2.0
backcall==0.2.0
backports.zoneinfo==0.2.1
Bottleneck
Brotli
build==1.2.2
cachetools==5.5.0
certifi 
cffi==1.17.1
charset-normalizer 
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
comm==0.2.2
contourpy==1.1.1
cryptography==43.0.1
cycler==0.12.1
debugpy==1.8.13
decorator==5.2.1
distro==1.9.0
Django==4.2.16
efficientnet_pytorch==0.7.1
et-xmlfile==1.1.0
exceptiongroup==1.2.2
executing==2.2.0
fancycompleter==0.9.1
fastapi==0.115.5
ffmpy==0.4.0
filelock 
filetype==1.2.0
flatbuffers==24.3.25
fonttools==4.53.1
frozenlist==1.5.0
fsspec==2024.9.0
gmpy2 
google-ai-generativelanguage==0.1.0
google-api-core==2.19.2
google-auth==2.34.0
google-generativeai==0.1.0rc1
googleapis-common-protos==1.65.0
gradio==4.44.1
gradio_client==1.3.0
grpcio==1.66.1
grpcio-status==1.62.3
h11==0.14.0
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.25.0
humanfriendly==10.0
idna 
imageio==2.35.1
importlib_metadata==8.5.0
importlib_resources==6.4.4
ipykernel==6.29.5
ipython==8.12.3
jedi==0.19.2
Jinja2 
jiter==0.5.0
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter_client==8.6.3
jupyter_core==5.7.2
kiwisolver==1.4.5
lazy_loader==0.4
llvmlite==0.41.1
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe 
matplotlib==3.7.5
matplotlib-inline==0.1.7
mdurl==0.1.2
mkl-fft 
mkl-random 
mkl-service==2.4.0
mpmath 
multidict==6.1.0
munch==4.0.0
nest-asyncio==1.6.0
networkx 
numba==0.58.1
numexpr 
numpy 
onnxruntime==1.19.2
onnxruntime-gpu==1.19.2
openai==1.45.1
opencv-python-headless==4.10.0.84
openpyxl==3.1.5
orjson==3.10.11
packaging==24.2
pandas 
parso==0.8.4
pdbpp==0.10.3
pdf2image==1.17.0
pdfminer.six==20231228
pdfplumber==0.11.4
pickleshare==0.7.5
pillow 
pip-tools==7.4.1
pkgutil_resolve_name==1.3.10
platformdirs==4.3.6
pooch==1.8.2
pretrainedmodels==0.7.4
prompt_toolkit==3.0.50
propcache==0.2.0
proto-plus==1.24.0
protobuf==4.25.4
psutil==7.0.0
pure_eval==0.2.3
pyarrow==17.0.0
pyasn1==0.6.1
pyasn1_modules==0.4.1
pycparser==2.22
pydantic==2.9.1
pydantic_core==2.23.3
pydub==0.25.1
Pygments==2.19.1
PyMatting==1.1.13
pynndescent==0.5.13
pypandoc==1.13
pyparsing==3.1.2
PyPDF2==3.0.1
pypdfium2==4.30.0
pyproject_hooks==1.2.0
pyreadline==2.1
pyreadline3==3.5.4
pyrepl==0.9.0
PySocks
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.17
pyTRS
pytz 
PyWavelets==1.4.1
pywin32==310
PyYAML 
pyzmq==26.3.0
referencing==0.35.1
regex==2024.9.11
rembg==2.0.59
requests 
rpds-py==0.20.1
rsa==4.9
ruff==0.7.4
safetensors==0.4.5
scikit-image==0.21.0
scikit-learn==1.3.2
scipy==1.10.1
segmentation-models-pytorch==0.3.3
semantic-version==2.10.0
shapely==2.0.6
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
sqlparse==0.5.1
stack-data==0.6.3
starlette==0.41.2
sympy
threadpoolctl==3.5.0
tifffile==2023.7.10
tiktoken==0.7.0
timm==0.9.2
tomli==2.0.1
tomlkit==0.12.0
torch==2.4.0
torchaudio==2.4.0
torchvision==0.19.0
tornado==6.4.2
tqdm==4.66.5
traitlets==5.14.3
typer==0.13.0
typing_extensions==4.12.2
tzdata 
umap-learn==0.5.6
urllib3 
uunet==2.2.1
uvicorn==0.32.0
watchdog==4.0.2
wcwidth==0.2.13
websockets==12.0
win-inet-pton 
wmctrl==0.5
yarl==1.15.2
zipp==3.20.2
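
To recreate this environment, the pinned entries above can be saved as a
requirements file for a Python 3.8 interpreter; the python==3.8.19 line
records the interpreter itself rather than a pip requirement, and the
unpinned entries resolve to whatever version pip selects. A minimal sketch,
assuming the list has been saved as requirements.txt:

import subprocess
import sys

# Install the pinned requirements into the current interpreter
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
    check=True,
)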