8 KI-Gebrauch

8.1 Setup

Show the code

library(tidyverse)
library(lubridate)
library(gt)
library(targets)
library(ggpubr)
library(scales)
# library(ggfittext)
library(easystats)
library(data.table)
library(tinytable)
# library(ggokabeito)

Show the code

source("_common.r")

Show the code

tar_load(
  c(
    ai_transcript_clicks_per_month,
    data_long,
    data_separated_distinct_slice,
    data_separated_filtered,
    idvisit_has_llm,
    llm_response_text,
    n_mc_answers_selected,
    mc_answers_with_timestamps,
    prompt_length,
    prompt_length_date_uni_course,
    time_spent_w_course_university,
    llm_response_text_date_course_uni,
    n_interactions_w_llm_course_date_course_uni
  )
)

8.2 Interaktion mit dem LLM

Berechnungsgrundlage: Für diese Analyse wurden alle Events der Kategorie llm gefiltert.

8.2.1 Art und Anzahl der Interaktionen mit dem LLM

Show the code

data_separated_filtered_ai <-
  data_separated_filtered |>
  filter(type == "eventcategory") |>
  filter(str_detect(value, "llm")) |>
  count(value, sort = TRUE) |>
  mutate(prop = n / sum(n))

data_separated_filtered_ai

Show the code

data_separated_filtered_ai |>
  mutate(prop = round(prop, 3)) |>
  ggtexttable()

8.3 Anzahl der `message_to_llm`

Show the code

llm_interactions <-
  data_separated_filtered |>
  filter(str_detect(value, "message_to_llm"))

8.3.1 Verteilung

Show the code

llm_interactions_count <-
  llm_interactions |>
  count(idvisit, sort = TRUE) |>
  rename(messages_to_llm_n = n)

llm_interactions_count |>
  describe_distribution(messages_to_llm_n, centrality = c("mean", "median")) |>
  print_md()

Variable	Median	MAD	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
messages_to_llm_n	6	2.97	9.65	8.22	8	(2.00, 44.00)	1.86	3.71	185	0

8.3.2 Diagramm

Show the code

gghistogram(
  llm_interactions_count,
  x = "messages_to_llm_n",
  bins = 10,
  add = "median"
) +
  labs(caption = "The vertical dotted line denotes the median.")

8.4 Anteil Visitors, die mit dem LLM interagieren

8.4.1 idvisit

Show the code

data_separated_filtered_llm_interact <-
  data_separated_filtered |>
  mutate(has_llm = str_detect(value, "llm")) |>
  group_by(idvisit) |>
  summarise(llm_used_during_visit = any(has_llm == TRUE)) |>
  count(llm_used_during_visit) |>
  mutate(prop = round(n / sum(n), 2))

data_separated_filtered_llm_interact |>
  gt()

llm_used_during_visit	n	prop
FALSE	13419	0.94
TRUE	788	0.06

Show the code

data_separated_filtered_llm_interact |>
  ggtexttable()

8.4.2 fingerprint unique

Show the code

data_separated_filtered_llm_interact_fingerprint <-
  data_separated_filtered |>
  mutate(has_llm = str_detect(value, "llm")) |>
  group_by(fingerprint) |>
  summarise(llm_used_during_visit = any(has_llm == TRUE)) |>
  count(llm_used_during_visit) |>
  mutate(prop = round(n / sum(n), 2))

data_separated_filtered_llm_interact_fingerprint |>
  gt()

llm_used_during_visit	n	prop
FALSE	6649	0.93
TRUE	511	0.07

Show the code

data_separated_filtered_llm_interact_fingerprint |>
  ggtexttable()

8.4.3 … Im Zeitverlauf

8.4.3.1 Absolutzahlen

Show the code

idvisit_has_llm |>
  head(30)

Show the code

idvisit_has_llm_timeline <-
  idvisit_has_llm |>
  count(year_month, uses_llm) |>
  ungroup() |>
  group_by(year_month) |>
  mutate(prop = round(n / sum(n), 2))

idvisit_has_llm_timeline

Show the code

idvisit_has_llm_timeline |>
  ggtexttable()

Show the code

idvisit_has_llm |>
  count(year_month, uses_llm) |>
  ungroup() |>
  mutate(year_month_date = ymd(paste0(year_month, "-01"))) |>
  group_by(year_month_date) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(
    x = year_month_date,
    y = prop,
    color = uses_llm,
    groups = uses_llm
  )) +
  # --- Highlight March–July (approx 1 Mar to 31 Jul) ---
  annotate(
    "rect",
    xmin = as.Date("2023-03-01"),
    xmax = as.Date("2023-07-31"),
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "skyblue"
  ) +

  annotate(
    "rect",
    xmin = as.Date("2024-03-01"),
    xmax = as.Date("2024-07-31"),
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "skyblue"
  ) +
  annotate(
    "rect",
    xmin = as.Date("2025-03-01"),
    xmax = as.Date("2025-07-31"),
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "skyblue"
  ) +

  # --- Highlight October–February (semester break or 2nd term) ---
  annotate(
    "rect",
    xmin = as.Date("2023-10-01"),
    xmax = as.Date("2024-02-28"),
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "orange"
  ) +
  # annotate("rect",
  #          xmin = as.Date("2024-10-01"), xmax = as.Date("2024-02-28"),
  #          ymin = -Inf, ymax = Inf, alpha = 0.2, fill = "orange") +
  annotate(
    "rect",
    xmin = as.Date("2024-10-01"),
    xmax = as.Date("2025-02-28"),
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "orange"
  ) +
  geom_point() +
  geom_line(aes(group = uses_llm)) +
  labs(
    title = "Visitors, die mit dem LLM interagieren im Zeitverlauf (Anteile)"
  ) +
  scale_x_date(breaks = pretty_breaks())

Show the code

idvisit_has_llm |>
  count(year_month, uses_llm) |>
  ungroup() |>
  mutate(year_month_date = ymd(paste0(year_month, "-01"))) |>
  group_by(year_month) |>
  ggplot(aes(x = year_month_date, y = n, color = uses_llm, groups = uses_llm)) +
  # --- Highlight March–July (approx 1 Mar to 31 Jul) ---
  annotate(
    "rect",
    xmin = as.Date("2023-03-01"),
    xmax = as.Date("2023-07-31"),
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "skyblue"
  ) +

  annotate(
    "rect",
    xmin = as.Date("2024-03-01"),
    xmax = as.Date("2024-07-31"),
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "skyblue"
  ) +
  annotate(
    "rect",
    xmin = as.Date("2025-03-01"),
    xmax = as.Date("2025-07-31"),
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "skyblue"
  ) +

  # --- Highlight October–February (semester break or 2nd term) ---
  annotate(
    "rect",
    xmin = as.Date("2023-10-01"),
    xmax = as.Date("2024-02-28"),
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "orange"
  ) +
  # annotate("rect",
  #          xmin = as.Date("2024-10-01"), xmax = as.Date("2024-02-28"),
  #          ymin = -Inf, ymax = Inf, alpha = 0.2, fill = "orange") +
  annotate(
    "rect",
    xmin = as.Date("2024-10-01"),
    xmax = as.Date("2025-02-28"),
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.2,
    fill = "orange"
  ) +
  geom_point() +
  geom_line(aes(group = uses_llm)) +
  labs(
    title = "Visitors, die mit dem LLM interagieren im Zeitverlauf (Anzahl)"
  ) +
  scale_x_date(breaks = pretty_breaks())

8.4.3.2 Anteile

Show the code

idvisit_has_llm |>
  count(year_month, uses_llm) |>
  ungroup() |>
  mutate(year_month_date = ymd(paste0(year_month, "-01"))) |>
  group_by(year_month_date) |>
  # ADDED: Calculate the proportion
  mutate(proportion = n / sum(n)) |>

  # Plot using the new 'proportion' variable
  ggplot(aes(x = year_month_date, y = proportion, fill = uses_llm)) +

  # ADDED: Use position = "fill"
  geom_area(position = "fill") +

  # ADDED: Format y-axis as percentage
  scale_y_continuous(labels = scales::label_percent()) +

  labs(
    title = "Anteil der Besucher, die mit dem LLM interagieren (Prozent)",
    y = "Prozentualer Anteil der Besucher",
    fill = "Interagiert mit LLM",
    x = "Datum"
  ) +
  scale_x_date(breaks = pretty_breaks())

8.5 Länge der Prompts (Input an das LLM)

8.5.1 Überblick

Show the code

prompt_length_no_prompts <-
  prompt_length |>
  select(-any_of(c("prompt", "value", "type")))

Show the code

prompt_length_no_prompts |>
  describe_distribution(token_length) |>
  print_md()

Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
token_length	6.86	6.16	4	(1.00, 79.00)	4.76	43.63	893	0

Show the code

prompt_length_no_prompts |>
  ggplot(aes(x = token_length)) +
  geom_histogram(binwidth = 10) +
  labs(
    title = "Length of prompts sent to the LLM",
    x = "Prompt length (in tokens)",
    y = "Number of prompts"
  ) +
  theme_minimal()

Show the code

describe_distribution(
  prompt_length$prompt_length,
  centrality = c("mean", "median")
)

NULL

8.5.2 Token-Länge nach Universitäten

Show the code

prompt_length_date_uni_course |>
  group_by(university) |>
  describe_distribution(token_length) |>
  print_md()

university	Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
evhn	token_length				(Inf, -Inf)			0	6
hnu	token_length	7.00	0.00	0	(7.00, 7.00)			6	12
hs-ansbach	token_length				(Inf, -Inf)			0	3
hswt	token_length	6.54	4.08	9	(1.00, 12.00)	-0.15	-1.47	48	1094
iqw	token_length				(Inf, -Inf)			0	1
th-nuernberg	token_length	8.11	9.20	8	(1.00, 79.00)	4.38	28.22	246	4224
th-owl	token_length				(Inf, -Inf)			0	1
thi	token_length	10.00	10.03	18	(2.00, 26.00)	1.31	-0.14	8	438

Show the code

ggboxplot(
  prompt_length_date_uni_course,
  x = "university",
  y = "token_length",
  add = "mean_se",
) +
  theme_minimal() +
  labs(
    title = "Prompt length by university",
    x = "University",
    y = "Prompt length (in tokens)"
  ) +
  coord_flip()

8.5.3 Token-Länge nach Kursen

Show the code

prompt_length_date_uni_course |>
  group_by(course) |>
  describe_distribution(token_length) |>
  print_md()

course	Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
anis2	token_length				(Inf, -Inf)			0	43
armeufa	token_length				(Inf, -Inf)			0	2
bare	token_length	7.00	0.00	0	(7.00, 7.00)			6	12
bio	token_length				(Inf, -Inf)			0	168
biotech	token_length				(Inf, -Inf)			0	127
biovete	token_length				(Inf, -Inf)			0	244
cta1	token_length	6.54	4.08	9	(1.00, 12.00)	-0.15	-1.47	48	1094
daba	token_length	5.00	0.00	0	(5.00, 5.00)			2	197
enerbi	token_length				(Inf, -Inf)			0	10
epsy	token_length				(Inf, -Inf)			0	1
etechde	token_length				(Inf, -Inf)			0	3
fodesoa	token_length	3.00	0.00	0	(3.00, 3.00)			2	324
fomesoa	token_length				(Inf, -Inf)			0	6
fosaq	token_length	4.00	0.00	0	(4.00, 4.00)			2	59
gdi	token_length	7.67	2.88	6	(4.00, 10.00)	-0.86	-1.88	6	47
gemwesa	token_length				(Inf, -Inf)			0	57
gesoa	token_length	6.67	9.64	4	(1.00, 79.00)	5.88	41.54	150	2315
kore	token_length				(Inf, -Inf)			0	3
mat11akzg	token_length				(Inf, -Inf)			0	2
mibio	token_length				(Inf, -Inf)			0	206
mioek	token_length	10.00	10.03	18	(2.00, 26.00)	1.31	-0.14	8	435
nlp	token_length	8.00	3.46	6	(5.00, 11.00)	0.00	-6.00	4	65
quame1	token_length				(Inf, -Inf)			0	3
quanch	token_length				(Inf, -Inf)			0	34
softa	token_length				(Inf, -Inf)			0	4
thesoa	token_length	11.15	8.45	8	(1.00, 32.00)	1.29	0.97	80	272
wirkori	token_length				(Inf, -Inf)			0	5
wisoa	token_length				(Inf, -Inf)			0	33
zemiws	token_length				(Inf, -Inf)			0	1

Show the code

ggboxplot(
  prompt_length_date_uni_course,
  x = "course",
  y = "token_length",
  add = "mean_se",
) +
  theme_minimal() +
  labs(
    title = "Prompt length by course",
    x = "Course",
    y = "Prompt length (in tokens)"
  ) +
  coord_flip()

8.5.4 Token-Länge im Zeitverlauf

Show the code

prompt_length_date_uni_course |>
  group_by(floor_date_month) |>
  describe_distribution(token_length) |>
  print_md()

floor_date_month	Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
2022-12-01	token_length				(Inf, -Inf)			0	329
2023-01-01	token_length				(Inf, -Inf)			0	455
2023-02-01	token_length				(Inf, -Inf)			0	561
2023-03-01	token_length				(Inf, -Inf)			0	149
2023-04-01	token_length				(Inf, -Inf)			0	253
2023-05-01	token_length				(Inf, -Inf)			0	391
2023-06-01	token_length				(Inf, -Inf)			0	292
2023-07-01	token_length				(Inf, -Inf)			0	441
2023-08-01	token_length				(Inf, -Inf)			0	26
2023-09-01	token_length				(Inf, -Inf)			0	39
2023-10-01	token_length				(Inf, -Inf)			0	614
2023-11-01	token_length	5.57	5.56	7.00	(1.00, 17.00)	1.77	3.36	7	656
2023-12-01	token_length				(Inf, -Inf)			0	519
2024-01-01	token_length	6.67	1.15	2.00	(6.00, 8.00)	1.73	-1.50	3	781
2024-02-01	token_length				(Inf, -Inf)			0	85
2024-03-01	token_length	5.54	2.47	1.00	(2.00, 13.00)	2.25	5.41	26	133
2024-04-01	token_length	5.33	1.67	2.00	(2.00, 9.00)	0.15	0.44	30	322
2024-05-01	token_length	3.57	1.81	4.00	(1.00, 5.00)	-0.98	-1.08	7	410
2024-06-01	token_length	8.16	5.82	9.00	(1.00, 26.00)	1.15	1.25	106	571
2024-07-01	token_length	7.14	6.07	13.00	(3.00, 16.00)	1.21	-0.86	7	740
2024-08-01	token_length				(Inf, -Inf)			0	16
2024-09-01	token_length	6.12	3.00	5.00	(4.00, 11.00)	0.82	-1.49	8	21
2024-10-01	token_length	7.26	11.67	4.00	(1.00, 79.00)	4.94	27.91	100	714
2024-11-01	token_length	4.46	2.44	3.00	(1.00, 12.00)	0.88	1.11	76	900
2024-12-01	token_length	6.54	6.18	4.00	(2.00, 26.00)	2.64	6.71	26	759
2025-01-01	token_length	8.19	6.83	8.00	(1.00, 32.00)	2.16	4.85	148	936
2025-02-01	token_length	4.90	1.74	2.00	(2.00, 8.00)	0.17	-0.44	20	152
2025-03-01	token_length	8.03	5.88	8.00	(1.00, 21.00)	0.79	-0.36	62	490
2025-04-01	token_length	6.16	3.68	4.00	(1.00, 17.00)	0.77	0.24	191	978
2025-05-01	token_length	6.38	2.77	5.00	(3.00, 11.00)	0.26	-1.47	37	546
2025-06-01	token_length	8.72	6.69	3.00	(1.00, 26.00)	1.33	1.77	29	315
2025-07-01	token_length	6.20	2.62	2.50	(4.00, 11.00)	1.50	0.86	10	427

Show the code

# Calculate limits properly
filtered_data <- prompt_length_date_uni_course |>
  filter(!is.na(floor_date_month)) |>
  mutate(floor_date_month_date = as.Date(floor_date_month))

lim <- c(
  min(filtered_data$floor_date_month_date, na.rm = TRUE),
  max(filtered_data$floor_date_month_date, na.rm = TRUE)
)

# Now create the plot
filtered_data |>
  ggplot(aes(x = floor_date_month_date, y = token_length)) +
  geom_violin(aes(group = floor_date_month_date)) +
  stat_summary(fun = "mean", geom = "point") +
  stat_summary(fun.data = "mean_se", geom = "errorbar", width = 0.2) +
  theme_minimal() +
  labs(
    title = "Prompt length over time",
    x = "Date",
    y = "Prompt length (in tokens)",
    caption = "The dots represent the mean and the error bars the standard error of the mean."
  ) +
  scale_x_date(limits = lim, labels = scales::label_date_short())

8.6 Anzahl der Interaktionen bei den Usern, die mit dem LLM interagieren

8.6.1 Insgesamt

Show the code

d_n_interactions_w_llm <-
  data_separated_filtered |>
  filter(type == "eventcategory") |>
  filter(str_detect(value, "llm")) |>
  group_by(idvisit) |>
  summarise(n_interactions_w_llm = n())

Show the code

d_n_interactions_w_llm |>
  select(n_interactions_w_llm) |>
  describe_distribution() |>
  print_md()

Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
n_interactions_w_llm	165.59	216.45	459	(1.00, 500.00)	0.69	-1.46	640	0

Show the code

d_n_interactions_w_llm |>
  ggplot(aes(x = n_interactions_w_llm)) +
  geom_histogram()

8.6.2 Interaktionen mit dem LLM - pro Kurs und pro Uni

Show the code

n_interactions_w_llm_course_date_course_uni |>
  # select(n_interactions_w_llm) |>
  group_by(university, course) |>
  summarise(n = n())

Show the code

n_interactions_w_llm_course_date_course_uni |>
  # select(n_interactions_w_llm) |>
  group_by(university, course) |>
  summarise(n = n()) |>
  ggplot(aes(x = reorder(course, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x =  "Course")

Show the code

#facet_wrap(~ university, scales = "free_y")

8.6.3 Interaktionen mit dem LLM - im Zeitverlauf

Show the code

rect_data <- comp_semester_rects(
  n_interactions_w_llm_course_date_course_uni,
  col_date = "floor_date_month"
)

n_interactions_w_llm_course_date_course_uni |>
  group_by(floor_date_month) |>
  summarise(n = n()) |>
  ggplot(aes(x = floor_date_month, y = n)) +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    fill = "grey",
    alpha = 0.2,
    inherit.aes = FALSE # Essential to use the rect_data columns
  ) +
  geom_line() +
  labs(x = "Date", y = "Number of interactions with LLM")

Show the code

# --- 1. Prepare Data ---
# Your original data processing for the plot
plot_data <- n_interactions_w_llm_course_date_course_uni |>
  group_by(floor_date_month) |>
  summarise(n = n()) |>
  ungroup() # Ungroup after summarise for easier use with ggplot

# --- 2. Determine Plot Range for Rectangles ---
# Find the min/max year and n-count from your *processed* plot_data
min_date <- min(plot_data$floor_date_month, na.rm = TRUE)
max_date <- max(plot_data$floor_date_month, na.rm = TRUE)
min_year <- year(min_date)
max_year <- year(max_date)

# Determine the Y-axis bounds for the rectangles
y_min <- min(plot_data$n, na.rm = TRUE)
y_max <- max(plot_data$n, na.rm = TRUE)

# --- 3. Calculate the Rectangle Coordinates (rect_data) ---

# Generate years for the rectangles, ensuring we cover the full range
# including potentially starting a "winter" semester in the min_year-1
# and ending in max_year+1
rect_years <- seq(min_year - 1, max_year + 1)

# Summer semester: March 1 (Y) to July 1 (Y)
summer_rects <- tibble(year = rect_years) |>
  mutate(
    xmin = ymd(paste0(year, "-03-01")),
    xmax = ymd(paste0(year, "-07-01"))
  )

# Winter semester: October 1 (Y) to February 1 (Y+1)
winter_rects <- tibble(year = rect_years) |>
  mutate(
    xmin = ymd(paste0(year, "-10-01")),
    xmax = ymd(paste0(year + 1, "-02-01"))
  )

# Combine, set Y bounds, and filter to the actual plot area
rect_data <- bind_rows(summer_rects, winter_rects) |>
  mutate(ymin = y_min, ymax = y_max) |>
  # Only keep rectangles that are fully or partially within the plot's X range
  filter(
    xmin <= max_date,
    xmax >= min_date
  )

# --- 4. Generate the Final Plot ---
plot_data |>
  ggplot(aes(x = floor_date_month, y = n)) +
  # Add the transparent grey rectangles first
  geom_rect(
    data = rect_data,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    fill = "grey",
    alpha = 0.2,
    inherit.aes = FALSE # Essential to use the rect_data columns
  ) +
  # Then plot the lines and points on top
  geom_line() +
  geom_point() + # Added point layer for clarity at each month
  theme_minimal() +
  labs(
    title = "Number of Interactions with LLM per Course Date per University",
    x = "Date",
    y = "Number of Interactions"
  )

8.7 Klick auf ein Wort im Transkript

Ausgewertet wird im Folgenden die Variable “click_transcript_word”.

8.7.1 Insgesamt

Show the code

data_separated_filtered |>
  filter(type == "subtitle") |>
  # rm empty rows:
  filter(!is.na(value) & value != "") |>
  count(click_transcript_word = str_detect(value, "click_transcript_word")) |>
  mutate(prop = round(n / sum(n), 2)) |>
  gt()

click_transcript_word	n	prop
FALSE	1138774	0.99
TRUE	8439	0.01

8.7.2 Im Zeitverlauf

8.7.2.1 idvisit

Show the code

click_transcript_word_per_month <-
  data_separated_filtered |>
  # rm all groups WITHOUT "click_transcript_word":
  group_by(idvisit) |>
  filter(!any(value = str_detect(value, "click_transcript_word"))) |>
  ungroup() |>
  mutate(date_visit = ymd_hms(value)) |>
  mutate(month_visit = floor_date(date_visit, unit = "month")) |>
  drop_na(date_visit) |>
  group_by(idvisit) |>
  slice(1) |>
  ungroup() |>
  count(month_visit)

click_transcript_word_per_month

Show the code

rect_data_word_per_month <- comp_semester_rects(
  click_transcript_word_per_month,
  col_date = "month_visit"
)

click_transcript_word_per_month |>
  ggplot(aes(x = month_visit, y = n)) +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = Inf),
    fill = "grey",
    alpha = 0.2,
    inherit.aes = FALSE # Essential to use the rect_data columns
  ) +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE, color = "blue", alpha = 0.7) +
  scale_x_date(labels = scales::label_date_short())

8.7.2.2 fingerprint unique

Show the code

click_transcript_word_per_month_fingerprint <-
  data_separated_filtered |>
  # rm all groups WITHOUT "click_transcript_word":
  group_by(fingerprint) |>
  filter(!any(value = str_detect(value, "click_transcript_word"))) |>
  ungroup() |>
  mutate(date_visit = ymd_hms(value)) |>
  mutate(month_visit = floor_date(date_visit, unit = "month")) |>
  drop_na(date_visit) |>
  group_by(fingerprint) |>
  slice(1) |>
  ungroup() |>
  count(month_visit)

click_transcript_word_per_month_fingerprint

Show the code

click_transcript_word_per_month_fingerprint |>
  ggplot(aes(x = month_visit, y = n)) +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = Inf),
    fill = "grey",
    alpha = 0.2,
    inherit.aes = FALSE # Essential to use the rect_data columns
  ) +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE, color = "blue", alpha = 0.7)

8.8 KI-Aktionen

8.8.1 Insgesamt (ganzer Zeitraum)

Show the code

data_long |>
  head(300)

8.8.2 Im Detail

Show the code

regex_pattern <- "Category: \"(.*?)(?=', Action)"

# Explaining this regex_pattern:
# Find the literal string
# 1. `Category: ` (surrounded by quotation marks)
# 2. Capture any characters (.*?) that follow, non-greedily, until...
# 3. ...it encounters the literal sequence,  ` Action`) immediately after the captured string.

ai_actions_count <-
  data_long |>
  # slice(1:1000) |>
  filter(str_detect(value, "transcript")) |>
  mutate(category = str_extract(value, regex_pattern)) |>
  select(category) |>
  mutate(category = str_replace_all(category, "[\"']", "")) |>
  count(category, sort = TRUE)

ai_actions_count |>
  tt()

category	n
NA	217862
Category: clear_transcript_text_for_llm_context	104111
Category: click_transcript_word	8439
Category: select_transcript_text_for_llm_context	576
Category: click_button	43
Category: llm_response_de	3
Category: llm_response_en	3

8.8.3 KI-Klicks pro Monat

Im Objekt wird gezählt, wie oft der String "click_transcript_word" in den Daten (Langformat) gefunden wird, s. Target ai_transcript_clicks_per_month in der Targets-Pipeline.

Show the code

ai_transcript_clicks_per_month |>
  head(30)

Show the code

ai_transcript_clicks_per_month_count <-
  ai_transcript_clicks_per_month |>
  count(year_month, clicks_transcript_any) |>
  ungroup() |>
  group_by(year_month) |>
  mutate(prop = round(n / sum(n), 2))

ai_transcript_clicks_per_month_count

Show the code

ai_transcript_clicks_per_month_count |>
  ggtexttable()

Show the code

rect_data <- comp_semester_rects(
  ai_transcript_clicks_per_month_count,
  col_date = "year_month"
)
# try common lubridate parsers (datetime -> date)

ai_transcript_clicks_per_month_count |>
  mutate(date = ymd(paste0(year_month, "-01"))) |>
  ggplot(aes(x = date, y = n)) +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    fill = "grey",
    alpha = 0.2,
    inherit.aes = FALSE # Essential to use the rect_data columns
  ) +
  geom_line(group = 1) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Number of AI transcript clicks per month",
    x = "date [months]"
  ) +
  geom_smooth(method = "loess", se = FALSE, color = "blue", alpha = 0.2)

Show the code

# --- 1. Prepare Data and Determine Year Range ---
# We assign the processed data to a temporary variable to calculate the year range.
ai_clicks_data <-
  ai_transcript_clicks_per_month_count |>
  mutate(date = ymd(paste0(year_month, "-01")))

# --- 2. Calculate the Vertical Line Intercept Dates ---
# Find the min/max year in the data
min_year <- min(year(ai_clicks_data$date), na.rm = TRUE)
max_year <- max(year(ai_clicks_data$date), na.rm = TRUE)
years <- seq(min_year, max_year)

# Define the target months: February (2), March (3), July (7), October (10)
vline_dates <- expand.grid(
  year = years,
  month = c(2, 3, 7, 10)
) |>
  mutate(
    date_str = paste0(year, "-", month, "-01"),
    vline_date = ymd(date_str)
  ) |>
  # Filter to only include dates within the actual data range for plotting
  filter(
    vline_date >= min(ai_clicks_data$date) &
      vline_date <= max(ai_clicks_data$date)
  ) |>
  pull(vline_date) |>
  as.Date()

# --- 3. Generate the Final Plot ---
ai_clicks_data |>
  ggplot(aes(x = date, y = n)) +
  # Add the vertical lines
  geom_vline(
    xintercept = vline_dates,
    color = "darkred",
    linetype = "dashed",
    alpha = 0.6
  ) +
  geom_line(group = 1) +
  geom_point() +
  theme_minimal() +
  labs(title = "Number of AI transcript clicks per month", x = "date [months]") +
  geom_smooth(method = "loess", se = FALSE, color = "blue", alpha = 0.5)

Show the code

# --- 1. Prepare Data and Determine Year Range ---
# We assign the processed data to a temporary variable to calculate the year range.
ai_clicks_data <- ai_transcript_clicks_per_month_count |>
  mutate(date = ymd(paste0(year_month, "-01")))

# Find the min/max year and n-count in the data
min_year <- min(year(ai_clicks_data$date), na.rm = TRUE)
max_year <- max(year(ai_clicks_data$date), na.rm = TRUE)
years <- seq(min_year, max_year)

# Determine the Y-axis bounds for the rectangles
y_min <- min(ai_clicks_data$n, na.rm = TRUE)
y_max <- max(ai_clicks_data$n, na.rm = TRUE)

# --- 2. Calculate the Rectangle Coordinates (rect_data) ---

# Period 1: March 1 (Y) to July 1 (Y)
rect_data_mar_jul <- tibble(year = years) |>
  mutate(
    xmin = ymd(paste0(year, "-03-01")),
    xmax = ymd(paste0(year, "-07-01"))
  )

# Period 2: October 1 (Y) to February 1 (Y+1)
# We need to include max_year + 1 in the sequence to capture the end dates
rect_data_oct_feb <- tibble(year = seq(min_year, max_year)) |>
  mutate(
    xmin = ymd(paste0(year, "-10-01")),
    xmax = ymd(paste0(year + 1, "-02-01"))
  )

# Combine, set Y bounds, and filter to the actual plot area
rect_data <- bind_rows(rect_data_mar_jul, rect_data_oct_feb) |>
  mutate(ymin = y_min, ymax = y_max) |>
  # Only keep rectangles that are fully or partially within the plot's X range
  filter(
    xmin <= max(ai_clicks_data$date, na.rm = TRUE),
    xmax >= min(ai_clicks_data$date, na.rm = TRUE)
  )

# --- 3. Generate the Final Plot ---
ai_clicks_data |>
  ggplot(aes(x = date, y = n)) +
  # Add the transparent grey rectangles first
  geom_rect(
    data = rect_data,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    fill = "grey",
    alpha = 0.2,
    inherit.aes = FALSE # Essential to use the rect_data columns
  ) +
  # Then plot the lines and points on top
  geom_line(group = 1) +
  geom_point() +
  theme_minimal() +
  labs(title = "Number of AI transcript clicks per month", x = "date [months]") +
  geom_smooth(method = "loess", se = FALSE, color = "blue", alpha = 0.5)

8.9 Output des LLMs: `llm_response` - Tokens und Tokenlänge

8.9.1 Deutsch vs. Englisch

Show the code

llm_response_text |>
  count(lang) |>
  mutate(prob = n / sum(n))

8.9.2 Anzahl der Tokens

Show the code

llm_response_text |>
  describe_distribution(select = "tokens_n") |>
  print_md()

Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
tokens_n	184.14	104.40	155.50	(11.00, 656.00)	0.45	0.31	397	0

8.9.3 Anzahl der Token nach Universitäten

Show the code

llm_response_text_date_course_uni |>
  group_by(university) |>
  describe_distribution(select = "tokens_n")

Show the code

llm_response_text_date_course_uni |>
  ggboxplot(
    x = "university",
    y = "tokens_n",
    add = "mean_se"
  ) +
  theme_minimal() +
  labs(
    title = "Number of tokens in LLM responses by university",
    x = "University",
    y = "Number of tokens"
  ) +
  coord_flip()

8.9.4 Anzahl der Token nach Kursen

Show the code

llm_response_text_date_course_uni |>
  group_by(course) |>
  describe_distribution(select = "tokens_n") |>
  print_md()

course	Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
anis2	tokens_n				(Inf, -Inf)			0	43
armeufa	tokens_n				(Inf, -Inf)			0	2
bare	tokens_n	171.00	46.87	91.00	(132.00, 223.00)	1.15	-1.50	3	12
bio	tokens_n				(Inf, -Inf)			0	168
biotech	tokens_n				(Inf, -Inf)			0	127
biovete	tokens_n				(Inf, -Inf)			0	244
cta1	tokens_n	93.38	71.24	72.75	(34.00, 314.00)	2.27	5.78	16	1101
daba	tokens_n	173.00		0.00	(173.00, 173.00)			1	197
enerbi	tokens_n				(Inf, -Inf)			0	10
epsy	tokens_n				(Inf, -Inf)			0	1
etechde	tokens_n				(Inf, -Inf)			0	3
fodesoa	tokens_n	256.00		0.00	(256.00, 256.00)			1	324
fomesoa	tokens_n				(Inf, -Inf)			0	6
fosaq	tokens_n				(Inf, -Inf)			0	60
gdi	tokens_n	185.00	5.00	10.00	(180.00, 190.00)	0.00	-1.50	3	47
gemwesa	tokens_n				(Inf, -Inf)			0	57
gesoa	tokens_n	186.41	126.56	217.00	(13.00, 450.00)	0.36	-1.02	64	2319
kore	tokens_n				(Inf, -Inf)			0	3
mat11akzg	tokens_n				(Inf, -Inf)			0	2
mibio	tokens_n				(Inf, -Inf)			0	206
mioek	tokens_n	156.50	60.10	85.00	(114.00, 199.00)	0.00	-2.00	2	435
nlp	tokens_n	356.50	153.44	217.00	(248.00, 465.00)	0.00	-2.00	2	65
quame1	tokens_n				(Inf, -Inf)			0	3
quanch	tokens_n				(Inf, -Inf)			0	34
softa	tokens_n				(Inf, -Inf)			0	4
thesoa	tokens_n	249.38	73.66	106.00	(15.00, 394.00)	-0.87	1.23	39	273
wirkori	tokens_n				(Inf, -Inf)			0	5
wisoa	tokens_n				(Inf, -Inf)			0	33
zemiws	tokens_n				(Inf, -Inf)			0	1

Show the code

llm_response_text_date_course_uni |>
  ggboxplot(
    x = "course",
    y = "tokens_n",
    add = "mean_se"
  ) +
  theme_minimal() +
  labs(
    title = "Number of tokens in LLM responses by course",
    x = "Course",
    y = "Number of tokens"
  ) +
  coord_flip()

8.9.5 Anzahl vorab existierender Fragen

8.9.5.1 Anzahl `verify_option_wrong` und `verify_option_correct`

8.9.5.1.1 idvisit

Show the code

verify_option_summary <- as.data.table(data_separated_filtered)[,
  .(idvisit, value) # keep only needed columns
][
  value %chin% c("verify_option_wrong", "verify_option_correct"), # filter
  .(verify_option = .N), # summarise count
  by = idvisit
]

verify_option_summary <- as_tibble(verify_option_summary)

Show the code

verify_option_summary <-
  data_separated_filtered |>
  group_by(idvisit) |>
  filter(value == "verify_option_wrong" | value == "verify_option_correct") |>
  summarise(verify_option = n())

Show the code

verify_option_summary |>
  gghistogram(x = "verify_option")

Show the code

verify_option_summary |>
  describe_distribution(verify_option) |>
  print_md()

Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
verify_option	35.24	36.76	30	(4.00, 245.00)	2.64	9.11	207	0

8.9.5.1.2 fingerprint unique

Show the code

# verify_option_summary_fingerprint <-
#   data_separated_filtered |>
#   group_by(fingerprint) |>
#   filter(value == "verify_option_wrong" | value == "verify_option_correct") |>
#   summarise(verify_option = n())

setDT(data_separated_filtered) # Ensure your data frame is a data.table

verify_option_summary_fingerprint <- data_separated_filtered[
  # 1. Filtering (i)
  value %in% c("verify_option_wrong", "verify_option_correct"),

  # 2. Summarize (.j) - calculate the count (n)
  .(verify_option = .N),

  # 3. Grouping (by)
  by = .(fingerprint)
]

verify_option_summary_fingerprint <- as_tibble(
  verify_option_summary_fingerprint
)

Show the code

verify_option_summary_fingerprint |>
  gghistogram(x = "verify_option")

Show the code

verify_option_summary_fingerprint |>
  describe_distribution(verify_option) |>
  print_md()

Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
verify_option	41.68	46.81	35	(4.00, 252.00)	2.35	5.94	175	0

8.9.5.2 Anzahl `verify_option_wrong` verify_option_div_by_4 - geteilt durch 4

Show the code

verify_option_summary <-
  verify_option_summary |>
  mutate(verify_option_div_by_4 = verify_option / 4)

verify_option_summary |>
  gghistogram(x = "verify_option_div_by_4")

Show the code

verify_option_summary |>
  mutate(verify_option_div_by_4 = verify_option / 4) |>
  describe_distribution(verify_option_div_by_4) |>
  print_md()

Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
verify_option_div_by_4	8.81	9.19	7.50	(1.00, 61.25)	2.64	9.11	207	0

8.9.5.3 Anzahl “Multiple choice answer selected”

Show the code

check_if_both_methods_give_same_number <-
  n_mc_answers_selected |>
  full_join(verify_option_summary)

check_if_both_methods_give_same_number |>
  head(20) |>
  gt()

n	verify_option	verify_option_div_by_4
1560
6	28	7.00
1569
2	14	3.50
2021
2	21	5.25
2022
10	126	31.50
2394
2	21	5.25
2718
2	7	1.75
2740
2	7	1.75
2883
2	126	31.50
2902
2	126	31.50
2912
2	77	19.25
2932
6	35	8.75
2950
2	56	14.00
2978
14	245	61.25
2979
4	35	8.75
3103
2	14	3.50
3257
2	7	1.75
3691
2	35	8.75
3700
4	84	21.00
3741
2	21	5.25
3804
2	70	17.50

Nein, beide Methoden liefern nicht die gleiche Zahl.

8.9.5.4 “Multiple choice answer selected” im Zeitverlauf

Show the code

mc_answers_with_timestamps <-
  mc_answers_with_timestamps |>
  mutate(month_start = floor_date(timestamp, "month")) |>
  ungroup() |>
  arrange(timestamp) |>
  mutate(n_cumulated = cumsum(n)) |>
  mutate(date = as.Date(timestamp))

lim <- c(
  floor_date(min(mc_answers_with_timestamps$date), unit = "year"),
  max(mc_answers_with_timestamps$date)
)

rect_data <- comp_semester_rects(mc_answers_with_timestamps, col_date = "date")

mc_answers_with_timestamps |>
  ggplot(aes(x = date, y = n_cumulated)) +
  scale_x_date(limits = lim, labels = scales::label_date_short()) +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    fill = "grey",
    alpha = 0.2,
    inherit.aes = FALSE # Essential to use the rect_data columns
  ) +
  geom_point() +
  geom_line()

8.9.5.5 Anzahl `generate_questionaire`

Show the code

# generate_questionaire_summary <-
#   data_separated_filtered |>
#   group_by(idvisit) |>
#   filter(value == "generate_questionaire") |>
#   summarise(generate_questionaire = n())

setDT(data_separated_filtered) # Convert the data.frame to a data.table in place

generate_questionaire_summary <- data_separated_filtered[
  # 1. Filtering (i)
  value == "generate_questionaire",

  # 2. Summarize (.j) - calculate the count (.N) and rename it
  .(generate_questionaire = .N),

  # 3. Grouping (by)
  by = .(idvisit)
]

Show the code

generate_questionaire_summary |>
  describe_distribution(generate_questionaire) |>
  print_md()

Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
generate_questionaire	3.11	5.93	2	(1.00, 66.00)	5.89	46.07	367	0

8.9.5.6 Anzahl vorab existierender Fragen

Show the code

setDT(generate_questionaire_summary)
setDT(verify_option_summary)

# 1. Full Join (Merge)
# Use the 'merge' function with all.x=TRUE and all.y=TRUE for a full join
# Assumes the join column is 'idvisit' as used in your previous examples
prior_existing_questions_summary <- merge(
  generate_questionaire_summary,
  verify_option_summary,
  by = "idvisit",
  all = TRUE
)

# 2. Mutate (Calculation)
# Use .j to create the new column
prior_existing_questions_summary[,
  prior_existing_questions_n := verify_option - generate_questionaire
]

# prior_existing_questions_summary <-
#   generate_questionaire_summary |>
#   full_join(verify_option_summary) |>
#   mutate(prior_existing_questions_n = verify_option - generate_questionaire)

Show the code

prior_existing_questions_summary |>
  # drop_na() |>
  gghistogram(x = "prior_existing_questions_n")

Show the code

prior_existing_questions_summary |>
  describe_distribution(prior_existing_questions_n) |>
  print_md()

Variable	Mean	SD	IQR	Range	Skewness	Kurtosis	n	n_Missing
prior_existing_questions_n	38.87	44.57	39	(-59.00, 236.00)	1.98	5.40	91	392

8.10 Input zum LLM: `message_to_llm` - Tokens und Tokenlänge

Show the code

prompt_length |>
  head()

Show the code

prompt_length |>
  describe_distribution(token_length)

Show the code

prompt_length |>
  ggplot(aes(x = token_length)) +
  geom_histogram(binwidth = 10) +
  labs(
    title = "Length of prompts sent to the LLM",
    x = "Prompt length (in tokens)",
    y = "Number of prompts"
  ) +
  theme_minimal()

8.1 Setup

8.2 Interaktion mit dem LLM

8.2.1 Art und Anzahl der Interaktionen mit dem LLM

8.3 Anzahl der message_to_llm

8.3.1 Verteilung

8.3.2 Diagramm

8.4 Anteil Visitors, die mit dem LLM interagieren

8.4.1 idvisit

8.4.2 fingerprint unique

8.4.3 … Im Zeitverlauf

8.4.3.1 Absolutzahlen

8.4.3.2 Anteile

8.5 Länge der Prompts (Input an das LLM)

8.5.1 Überblick

8.5.2 Token-Länge nach Universitäten

8.5.3 Token-Länge nach Kursen

8.5.4 Token-Länge im Zeitverlauf

8.6 Anzahl der Interaktionen bei den Usern, die mit dem LLM interagieren

8.6.1 Insgesamt

8.6.2 Interaktionen mit dem LLM - pro Kurs und pro Uni

8.6.3 Interaktionen mit dem LLM - im Zeitverlauf

8.7 Klick auf ein Wort im Transkript

8.7.1 Insgesamt

8.7.2 Im Zeitverlauf

8.7.2.1 idvisit

8.7.2.2 fingerprint unique

8.8 KI-Aktionen

8.8.1 Insgesamt (ganzer Zeitraum)

8.8.2 Im Detail

8.8.3 KI-Klicks pro Monat

8.9 Output des LLMs: llm_response - Tokens und Tokenlänge

8.9.1 Deutsch vs. Englisch

8.9.2 Anzahl der Tokens

8.9.3 Anzahl der Token nach Universitäten

8.9.4 Anzahl der Token nach Kursen

8.9.5 Anzahl vorab existierender Fragen

8.9.5.1 Anzahl verify_option_wrong und verify_option_correct

8.9.5.1.1 idvisit

8.9.5.1.2 fingerprint unique

8.9.5.2 Anzahl verify_option_wrong verify_option_div_by_4 - geteilt durch 4

8.9.5.3 Anzahl “Multiple choice answer selected”

8.9.5.4 “Multiple choice answer selected” im Zeitverlauf

8.9.5.5 Anzahl generate_questionaire

8.9.5.6 Anzahl vorab existierender Fragen

8.10 Input zum LLM: message_to_llm - Tokens und Tokenlänge

8.3 Anzahl der `message_to_llm`

8.9 Output des LLMs: `llm_response` - Tokens und Tokenlänge

8.9.5.1 Anzahl `verify_option_wrong` und `verify_option_correct`

8.9.5.2 Anzahl `verify_option_wrong` verify_option_div_by_4 - geteilt durch 4

8.9.5.5 Anzahl `generate_questionaire`

8.10 Input zum LLM: `message_to_llm` - Tokens und Tokenlänge