Show the code
library(targets)
library(tidyverse)
library(ggokabeito)
library(easystats)
library(gt)
library(ggfittext)
library(scales)
library(visdat)
library(collapse)
library(ggpubr)
library(knitr)
library(tinytable)
library(data.table)
Dieser Arbeitsbericht schildert das technische Vorgehen im Rahmen der Analyse der Matomo-Daten des BMBF-Projekts “HaNS”.
Die Matomo-Klickdaten aller Semester der Projektlaufzeit wurden für diese Analyse verarbeitet. Mit Hilfe einer R-Pipeline wurden eine Reihe von Forschungsfragen analysiert.
Der komplette Code ist online dokumentiert unter https://github.com/sebastiansauer/hans. Aus Datenschutzgründen sind online keine Daten eingestellt.
Die zentrale Analyse-Pipeline-Datei ist https://github.com/sebastiansauer/hans/blob/main/_targets.R.
library(targets)
library(tidyverse)
library(ggokabeito)
library(easystats)
library(gt)
library(ggfittext)
library(scales)
library(visdat)
library(collapse)
library(ggpubr)
library(knitr)
library(tinytable)
library(data.table)
library(ggplot2)
# Global report setup: default ggplot2 theme, week start and headless Chrome.
theme_set(theme_minimal())
options(lubridate.week.start = 1) # Monday as first day
# options(collapse_mask = "all") # use collapse for all dplyr operations
options(chromote.headless = "new") # headless Chrome is needed for gtsave()

# Override the default discrete colour/fill scales so that every plot in this
# report uses the ColorBrewer "Set2" palette. Arguments passed via `...` are
# accepted for signature compatibility but intentionally ignored.
scale_colour_discrete <- function(...) scale_colour_brewer(palette = "Set2")
scale_fill_discrete <- function(...) scale_fill_brewer(palette = "Set2")
tar_load(ai_transcript_clicks_per_month)
tar_load(config)
tar_load(course_and_uni_per_visit)
tar_load(data_all_fct)
tar_load(data_long)
tar_load(data_prepped)
tar_load(data_separated_distinct_slice)
tar_load(data_separated_filtered)
#tar_load(data_users_only)
tar_load(idvisit_has_llm)
tar_load(llm_response_text)
tar_load(n_action)
tar_load(n_action_type)
tar_load(n_action_w_date)
tar_load(time_duration)
tar_load(time_since_last_visit)
tar_load(time_spent)
tar_load(time_spent_w_course_university)
tar_load(time_visit_wday)
tar_load(n_mc_answers_selected)
tar_load(mc_answers_with_timestamps)
tar_load(n_action_fingerprint)
tar_load(time_visit_wday_fingerprint)
tar_load(n_action_w_date_fingerprint)
tar_load(time_spent_fingerprint)
Die Analyse wird im Rahmen einer Targets-Pipeline beschrieben und ist offen auf Github einsehbar.
Aufgrund des “rechts flatternden” Datenformats (d.h. unterschiedliche Zeilenlängen) wurden die Daten in ein Langformat überführt, zwecks besserer/einfacherer Analyse.
Dazu wurden (neben den ID-Variablen, v.a. idvisit
) die actionDetails_
-Variablen verwendet. Der Code des Pivotierens in das Langformat ist in der Funktion longify-data.R einsehbar.
Die Daten im Langformat wurden dann noch etwas aufbereitet mit der Funktion slimify-data.R.
|>
data_separated_filtered head(30)
Der Roh-Datensatz verfügt über
Jede Zeile entspricht einem “Visit”.
<-
data_all_fct_head100 %>%
data_all_fct select(1:100) %>%
slice_head(n = 100)
# One row per column of the raw data: position, name and share of missing
# values. seq_len() is used instead of 1:ncol() so that a zero-column input
# yields an empty id vector rather than c(1, 0).
d_na_cols <- data.frame(
  id = seq_len(ncol(data_all_fct)),
  names = names(data_all_fct),
  na_prop = colMeans(is.na(data_all_fct))
)
|>
d_na_cols filter(na_prop == 1)
<-
no_na_cols |>
d_na_cols filter(na_prop > .9) |>
nrow()
no_na_cols
[1] 1951
Sehr viele Spalten (1951) sind fast leer, d.h. sie bestehen zu mehr als 90 % aus fehlenden Werten.
# Lookup table of the column names kept in `data_all_fct_head100`.
# The id is derived from the names themselves (not hard-coded as 1:100), so
# the data frame is still valid if fewer or more columns were selected.
head100_col_names <- names(data_all_fct_head100)
d_100_names <- data.frame(
  id = seq_along(head100_col_names),
  col_name = head100_col_names
)
d_100_names
data_all_fct_head100
%>%
data_separated_filtered slice(1:100)
Entfernt man Developer, Admins und Lecturers aus dem Roh-Datensatz so bleiben weniger Zeilen übrig:
|> dim() n_action
[1] 14207 2
|>
n_action head(30)
|> head(30) n_action_fingerprint
|>
n_action_w_date head(30)
# Earliest and latest visit start in the data; missing timestamps are ignored.
min_max_time <-
  n_action_w_date |>
  summarise(
    time_min = min(date_time_start, na.rm = TRUE),
    time_max = max(date_time_start, na.rm = TRUE)
  )
|>
min_max_time gt()
time_min | time_max |
---|---|
2022-12-05 15:33:45 | 2025-07-14 23:40:45 |
Erster Visit im Datensatz: 2022-12-05 15:33:45.
Letzter Visit im Datensatz: 2025-07-14 23:40:45.
Diese Statistik wurde auf Basis des Datenobjekts data_separated_filtered
berechnet, vgl. das Target dieses Objekts in der Pipeline.
|>
time_visit_wday head(30)
<-
time_since_last_visit |>
time_since_last_visit mutate(dayssincelastvisit = as.numeric(dayssincelastvisit)) |>
distinct(idvisit, .keep_all = TRUE)
|>
time_since_last_visit ::describe_distribution(dayssincelastvisit) |>
datawizard::kable(digits = 2) knitr
Variable | Mean | SD | IQR | Min | Max | Skewness | Kurtosis | n | n_Missing |
---|---|---|---|---|---|---|---|---|---|
dayssincelastvisit | 6.89 | 15.75 | 0 | 1 | 87 | 2.98 | 8.26 | 14207 | 0 |
|>
time_since_last_visit ggplot(aes(x = dayssincelastvisit)) +
geom_density() +
labs(
title = "If visitor return, they return mostly not later than a few days."
)
Die Nutzer nutzen die Seite in Abständen von wenigen Tagen?
|> head() time_visit_wday_fingerprint
# Days since last visit, reduced to one row per fingerprint (unique user).
time_since_last_visit_fingerprint <-
  time_since_last_visit |>
  mutate(dayssincelastvisit = as.numeric(dayssincelastvisit)) |>
  distinct(fingerprint, .keep_all = TRUE)

# Describe the fingerprint-level object. The previous version described
# `time_since_last_visit` again and therefore repeated the visit-level table.
time_since_last_visit_fingerprint |>
  datawizard::describe_distribution(dayssincelastvisit) |>
  knitr::kable(digits = 2)
Variable | Mean | SD | IQR | Min | Max | Skewness | Kurtosis | n | n_Missing |
---|---|---|---|---|---|---|---|---|---|
dayssincelastvisit | 6.89 | 15.75 | 0 | 1 | 87 | 2.98 | 8.26 | 14207 | 0 |
# Density of the days since last visit per fingerprint. Uses the
# fingerprint-level object rather than the visit-level one.
time_since_last_visit_fingerprint |>
  ggplot(aes(x = dayssincelastvisit)) +
  geom_density() +
  labs(
    title = "If visitor return, they return mostly not later than a few days."
  )
<-
time_since_last_visit_per_course |>
time_since_last_visit left_join(course_and_uni_per_visit) |>
drop_na()
<-
time_since_last_visit_per_course_summary |>
time_since_last_visit_per_course group_by(course) |>
summarise(
dayssincelastvisit_mean = mean(dayssincelastvisit),
dayssincelastvisit_sd = sd(dayssincelastvisit),
dayssincelastvisit_n = n()
|>
) mutate(
dayssincelastvisit_n_log = log(dayssincelastvisit_n, base = 10) + 0.001
)
time_since_last_visit_per_course_summary
# Mean (±SD) of days since last visit per course, ordered by the mean.
# Point alpha uses the precomputed log10 sample size
# (`dayssincelastvisit_n_log`) so that the plot matches its caption; the
# sample size itself is printed at the right edge of each row.
time_since_last_visit_per_course_summary |>
  ggplot(aes(
    y = reorder(course, dayssincelastvisit_mean),
    x = dayssincelastvisit_mean
  )) +
  geom_errorbar(aes(
    xmin = dayssincelastvisit_mean - dayssincelastvisit_sd,
    xmax = dayssincelastvisit_mean + dayssincelastvisit_sd
  )) +
  geom_point(aes(alpha = dayssincelastvisit_n_log), show.legend = FALSE) +
  labs(
    x = "Days since last visit (mean±sd)",
    y = "course",
    title = "In some courses, users use HaNS frequently.",
    caption = "Grey saturation of the mean dots refers to the log10 of the sample size (N)"
  ) +
  geom_text(
    aes(label = round(dayssincelastvisit_n)),
    x = Inf,
    hjust = 1.2,
    size = 2
  ) +
  # Column header for the sample sizes printed by geom_text() above.
  annotate(
    x = Inf,
    y = Inf,
    label = "N",
    geom = "label",
    hjust = 1,
    vjust = 1
  ) +
  scale_y_discrete(expand = expansion(mult = 0.1)) +
  theme_minimal()
Wie viele Visits (von Hans) gab es?
<-
time_visit_wday_summary |>
time_visit_wday ungroup() |>
mutate(month_start = floor_date(date_time, "month")) |>
mutate(
month_name = month(date_time, label = TRUE, abbr = FALSE),
month_num = month(date_time, label = FALSE),
year_num = year(date_time)
)
|>
time_visit_wday_summary group_by(year_num, month_num) |>
summarise(n = n()) |>
gt()
month_num | n |
---|---|
2022 | |
12 | 329 |
2023 | |
1 | 455 |
2 | 561 |
3 | 149 |
4 | 253 |
5 | 391 |
6 | 292 |
7 | 441 |
8 | 26 |
9 | 39 |
10 | 614 |
11 | 660 |
12 | 519 |
2024 | |
1 | 783 |
2 | 85 |
3 | 138 |
4 | 329 |
5 | 413 |
6 | 593 |
7 | 743 |
8 | 16 |
9 | 23 |
10 | 731 |
11 | 918 |
12 | 765 |
2025 | |
1 | 959 |
2 | 155 |
3 | 507 |
4 | 1011 |
5 | 557 |
6 | 321 |
7 | 430 |
NA | |
NA | 1 |
<-
time_visit_wday_summary_fingerprint |>
time_visit_wday_fingerprint ungroup() |>
mutate(month_start = floor_date(date_time, "month")) |>
mutate(
month_name = month(date_time, label = TRUE, abbr = FALSE),
month_num = month(date_time, label = FALSE),
year_num = year(date_time)
)
|>
time_visit_wday_summary_fingerprint group_by(year_num, month_num) |>
summarise(n = n()) |>
gt()
month_num | n |
---|---|
2022 | |
12 | 235 |
2023 | |
1 | 248 |
2 | 303 |
3 | 99 |
4 | 160 |
5 | 226 |
6 | 195 |
7 | 227 |
8 | 17 |
9 | 23 |
10 | 402 |
11 | 412 |
12 | 325 |
2024 | |
1 | 445 |
2 | 50 |
3 | 94 |
4 | 179 |
5 | 204 |
6 | 274 |
7 | 214 |
8 | 10 |
9 | 16 |
10 | 365 |
11 | 417 |
12 | 317 |
2025 | |
1 | 347 |
2 | 74 |
3 | 217 |
4 | 424 |
5 | 273 |
6 | 171 |
7 | 196 |
NA | |
NA | 1 |
|>
time_visit_wday_summary_fingerprint group_by(year_num, month_start) |>
summarise(n = n()) |>
ggplot(aes(x = month_start, y = n)) +
geom_line(group = 1, color = "grey60") +
geom_point() +
labs(
title = "The number of visits reflect the teaching periods of the semesters.",
x = "month/year"
)
<-
time_visit_wday_summary_week |>
time_visit_wday ungroup() |>
mutate(week_start = floor_date(date_time, "week")) |>
mutate(week_num = week(date_time), year_num = year(date_time))
<-
time_visit_wday_summary_week_summarized |>
time_visit_wday_summary_week group_by(year_num, week_num) |>
summarise(n = n())
time_visit_wday_summary_week_summarized
<-
time_visit_wday_summary_week_summarized_dateformat |>
time_visit_wday_summary_week group_by(week_start) |>
summarise(n = n())
|>
time_visit_wday_summary_week_summarized_dateformat ggplot(aes(x = week_start, y = n)) +
geom_line(group = 1, color = "grey60") +
geom_point() +
geom_smooth(method = "gam", se = FALSE, color = "blue") +
labs(
title = "The number of visits is increasing and reflects the teaching periods of the semesters.",
x = "week number/year"
)
The number of visits has increased over time.
|>
time_visit_wday_summary group_by(year_num, month_start) |>
summarise(n = n()) |>
ungroup() |>
mutate(n_cumsum = cumsum(n)) |>
ggplot(aes(x = month_start, y = n_cumsum)) +
geom_line(group = 1, color = "grey60") +
geom_point() +
theme_minimal() +
geom_smooth(method = "lm") +
labs(title = "Visits have increased linearly over time.", x = "month/year")
|>
time_visit_wday_summary_fingerprint group_by(year_num, month_start) |>
summarise(n = n()) |>
ungroup() |>
mutate(n_cumsum = cumsum(n)) |>
ggplot(aes(x = month_start, y = n_cumsum)) +
geom_line(group = 1, color = "grey60") +
geom_point() +
theme_minimal() +
geom_smooth(method = "lm") +
labs(title = "Visits have increased linearly over time.", x = "month/year")
|>
time_visit_wday_summary_week group_by(year_num, week_start) |>
summarise(n = n()) |>
ungroup() |>
mutate(n_cumsum = cumsum(n)) |>
ggplot(aes(x = week_start, y = n_cumsum)) +
geom_line(group = 1, color = "grey60") +
geom_point() +
theme_minimal() +
geom_smooth(method = "lm") +
labs(
title = "Visits have increased approx. linearly over time.",
x = "week/year"
)
Die folgenden Statistiken beruhen auf dem Datensatz data_separated_filtered
:
glimpse(data_separated_filtered)
Rows: 4,477,584
Columns: 5
$ nr <int> 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5…
$ type <fct> subtitle, timestamp, eventcategory, eventaction, timestamp…
$ value <fct> "https://hans.th-nuernberg.de/", "2023-03-23 18:37:56", "c…
$ idvisit <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ fingerprint <fct> aa8a78771b4f21ff, aa8a78771b4f21ff, aa8a78771b4f21ff, aa8a…
nr
fasst die Nummer der Aktion innerhalb eines bestimmten Visits.
|>
data_separated_filtered distinct(fingerprint, .keep_all = TRUE) |>
glimpse()
Rows: 7,160
Columns: 5
$ nr <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ type <fct> subtitle, subtitle, subtitle, subtitle, subtitle, subtitle…
$ value <fct> "https://hans.th-nuernberg.de/", "https://hans.th-nuernber…
$ idvisit <int> 1, 3, 6, 7, 8, 10, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,…
$ fingerprint <fct> aa8a78771b4f21ff, 1f026ad3cbbdf325, 518965d4e1ae7e2d, aa95…
<-
tbl_n_action |>
n_action describe_distribution(nr_max, centrality = c("median", "mean"))
tbl_n_action
nr_max
gibt den Maximalwert von nr
zurück, sagt also, wie viele Aktionen maximal während eines Visits ausgeführt wurden.
Betrachtet man die Anzahl der Aktionen pro Visit näher, so fällt auf, dass der Maximalwert (499) sehr häufig vorkommt:
|>
n_action count(nr_max) |>
ggplot(aes(x = nr_max, y = n)) +
geom_col() +
geom_vline(
xintercept = tbl_n_action$Median,
color = "blue",
linetype = "dashed"
+
) labs(
caption = "Vertical dashed lines shows the median.",
title = "Most users to only a few actions, but some do many."
)
Die meisten Nutzer machen nur wenige Aktionen pro Visit, aber einige machen sehr viele.
Hier noch in einer anderen Darstellung:
Der Maximalwert ist einfach auffällig häufig:
|>
n_action count(nr_max == 499) |>
gt()
nr_max == 499 | n |
---|---|
FALSE | 13626 |
TRUE | 581 |
Es erscheint plausibel, dass der Maximalwert alle “gekappten” (zensierten, abgeschnittenen) Werte fasst, also viele Werte, die eigentlich größer wären (aber dann zensiert wurden).
<-
tbl_n_action_fingerprint |>
n_action_fingerprint describe_distribution(nr_max, centrality = c("median", "mean"))
tbl_n_action_fingerprint
|>
n_action_fingerprint count(nr_max) |>
ggplot(aes(x = nr_max, y = n)) +
geom_col() +
geom_vline(
xintercept = tbl_n_action_fingerprint$Median,
color = "blue",
linetype = "dashed"
+
) labs(
caption = "Vertical dashed lines shows the median.",
title = "Most users to only a few actions, but some do many."
)
<-
n_action_lt_500 |>
n_action filter(nr_max != 499)
|>
n_action_lt_500 describe_distribution(nr_max) |>
gt() |>
fmt_number(columns = where(is.numeric), decimals = 2)
Variable | Mean | SD | IQR | Min | Max | Skewness | Kurtosis | n | n_Missing |
---|---|---|---|---|---|---|---|---|---|
nr_max | 61.88 | 88.53 | 77.00 | 1.00 | 496.00 | 2.27 | 5.47 | 13,626.00 | 0.00 |
<-
n_action_lt_500_fingerprint |>
n_action_fingerprint filter(nr_max != 499)
|>
n_action_lt_500_fingerprint describe_distribution(nr_max) |>
gt() |>
fmt_number(columns = where(is.numeric), decimals = 2)
Variable | Mean | SD | IQR | Min | Max | Skewness | Kurtosis | n | n_Missing |
---|---|---|---|---|---|---|---|---|---|
nr_max | 75.78 | 99.73 | 100.00 | 1.00 | 496.00 | 1.88 | 3.31 | 6,771.00 | 0.00 |
|>
course_and_uni_per_visit count(university)
|>
time_spent_w_course_university count(year, course)
<-
n_actions_searches_interactions |>
data_prepped select(
idvisit,
fingerprint,any_of(c(
"searches",
"actions",
"interactions",
"referrertype",
"referrername",
"language",
"devicetype",
"devicemodel",
"operatingsystem",
"browsername"
)) )
Auswertung - der Anzahlen der uniquen visitids und uniquen Fingerprints - Mittelwerte der Anzahl der Suchen und Aktionen pro Besuch
|>
n_actions_searches_interactions as.data.frame() |>
summarise(
idvisit_n = length(unique(idvisit)),
fingerprint_n = length(unique(fingerprint)),
actions_mean = mean(as.integer(actions), na.rm = TRUE),
searches_mean = mean(as.integer(searches), na.rm = TRUE)
)
Es gibt etwa doppelt so viele Besuche (Visits) wie eindeutige Nutzer (Fingerprints).
|>
n_actions_searches_interactions count(referrertype, sort = TRUE)
|>
n_actions_searches_interactions count(referrername, sort = TRUE)
|>
n_actions_searches_interactions count(devicemodel, sort = TRUE) |>
slice_head(n = 10)
|>
n_actions_searches_interactions count(operatingsystem, sort = TRUE) |>
slice_head(n = 10)
|>
n_actions_searches_interactions count(browsername, sort = TRUE) |>
slice_head(n = 10)
Die Mac-User scheinen besonders aktiv zu sein auf HaNS.
# Location and spread of the number of actions per visit, rounded to whole
# actions; used for the annotated histograms.
n_action_avg <- mean(n_action$nr_max) |> round(0)
n_action_median <- median(n_action$nr_max) |> round(0)
n_action_sd <- sd(n_action$nr_max) |> round(0)
n_action_iqr <- IQR(n_action$nr_max) |> round(0)
|>
n_action ggplot() +
geom_histogram(aes(x = nr_max)) +
labs(
x = "Anzahl von Aktionen pro Visit",
y = "n",
caption = "Der vertikale Strich zeigt den Mittelwert; der horizontale MW±SD"
+
) theme_minimal() +
geom_vline(xintercept = n_action_avg, color = palette_okabe_ito()[1]) +
geom_segment(
x = n_action_avg - n_action_sd,
y = 0,
xend = n_action_avg + n_action_sd,
yend = 0,
color = palette_okabe_ito()[2],
size = 2
+
) annotate(
"label",
x = n_action_avg,
y = 1500,
label = paste0("MW = ", n_action_avg)
+
) annotate(
"label",
x = n_action_avg + n_action_sd,
y = 0,
label = paste0("SD = ", n_action_sd)
)
#geom_label(aes(x = n_action_avg), y = 1, label = "Mean")
|>
n_action ggplot() +
geom_histogram(aes(x = nr_max)) +
labs(
x = "Anzahl von Aktionen pro Visit",
y = "n",
caption = "Der vertikale Strich zeigt den Median; der horizontale Median±IQR"
+
) theme_minimal() +
geom_vline(xintercept = n_action_median, color = palette_okabe_ito()[1]) +
geom_segment(
x = n_action_median - n_action_iqr,
y = 0,
xend = n_action_median + n_action_iqr,
yend = 0,
color = palette_okabe_ito()[2],
size = 2
+
) annotate(
"label",
x = n_action_median,
y = 1500,
label = paste0("Md = ", n_action_median)
+
) annotate(
"label",
x = n_action_median + n_action_iqr,
y = 0,
label = paste0("IQR = ", n_action_iqr)
)
#geom_label(aes(x = n_action_avg), y = 1, label = "Mean")
= mean(n_action_fingerprint$nr_max) |> round(0)
n_action_fingerprint_avg = median(n_action_fingerprint$nr_max) |> round(0)
n_action_fingerprint_median = sd(n_action_fingerprint$nr_max) |> round(0)
n_action_fingerprint_sd = IQR(n_action_fingerprint$nr_max) |> round(0)
n_action_fingerprint_iqr
|>
n_action_fingerprint ggplot() +
geom_histogram(aes(x = nr_max)) +
labs(
x = "Anzahl von Aktionen pro Visit",
y = "n",
caption = "Der vertikale Strich zeigt den Mittelwert; der horizontale MW±SD"
+
) theme_minimal() +
geom_vline(
xintercept = n_action_fingerprint_avg,
color = palette_okabe_ito()[1]
+
) geom_segment(
x = n_action_fingerprint_avg - n_action_fingerprint_sd,
y = 0,
xend = n_action_fingerprint_avg + n_action_fingerprint_sd,
yend = 0,
color = palette_okabe_ito()[2],
size = 2
+
) annotate(
"label",
x = n_action_fingerprint_avg,
y = 1500,
label = paste0("MW = ", n_action_fingerprint_avg)
+
) annotate(
"label",
x = n_action_fingerprint_avg + n_action_fingerprint_sd,
y = 0,
label = paste0("SD = ", n_action_fingerprint_sd)
)
#geom_label(aes(x = n_action_fingerprint_avg), y = 1, label = "Mean")
|>
n_action_fingerprint ggplot() +
geom_histogram(aes(x = nr_max)) +
labs(
x = "Anzahl von Aktionen pro Visit",
y = "n",
caption = "Der vertikale Strich zeigt den Median; der horizontale Median±IQR"
+
) theme_minimal() +
geom_vline(
xintercept = n_action_fingerprint_median,
color = palette_okabe_ito()[1]
+
) geom_segment(
x = n_action_fingerprint_median - n_action_fingerprint_iqr,
y = 0,
xend = n_action_fingerprint_median + n_action_fingerprint_iqr,
yend = 0,
color = palette_okabe_ito()[2],
size = 2
+
) annotate(
"label",
x = n_action_fingerprint_median,
y = 1500,
label = paste0("Md = ", n_action_fingerprint_median)
+
) annotate(
"label",
x = n_action_fingerprint_median + n_action_fingerprint_iqr,
y = 0,
label = paste0("IQR = ", n_action_fingerprint_iqr)
)
#geom_label(aes(x = n_action_fingerprint_avg), y = 1, label = "Mean")
# Mean and SD of actions per visit after removing the censored maximum (499).
n_action_avg2 <- mean(n_action_lt_500$nr_max) |> round(0)
n_action_sd2 <- sd(n_action_lt_500$nr_max) |> round(2)

# Histogram with the mean (vertical line) and mean ± SD (horizontal segment).
n_action_lt_500 |>
  ggplot() +
  geom_histogram(aes(x = nr_max)) +
  labs(
    x = "Anzahl von Aktionen pro Visit",
    y = "n",
    title = "Verteilung der User-Aktionen pro Visit",
    caption = "Der vertikale Strich zeigt den Mittelwert; der horizontale die SD"
  ) +
  theme_minimal() +
  geom_vline(xintercept = n_action_avg2, color = palette_okabe_ito()[1]) +
  geom_segment(
    # Both ends are centred on the mean of the filtered data. The left end
    # previously used `n_action_avg`, the mean of the unfiltered data.
    x = n_action_avg2 - n_action_sd2,
    y = 0,
    xend = n_action_avg2 + n_action_sd2,
    yend = 0,
    color = palette_okabe_ito()[2],
    size = 2
  ) +
  annotate(
    "label",
    x = n_action_avg2,
    y = 1500,
    label = paste0("MW = ", n_action_avg2)
  ) +
  annotate(
    "label",
    x = n_action_avg2 + n_action_sd2,
    y = 0,
    label = paste0("SD = ", n_action_sd2)
  )
# Mean and SD of actions per fingerprint after removing the censored
# maximum (499).
n_action_fingerprint_avg2 <- mean(n_action_lt_500_fingerprint$nr_max) |>
  round(0)
n_action_fingerprint_sd2 <- sd(n_action_lt_500_fingerprint$nr_max) |>
  round(2)

# Histogram with the mean (vertical line) and mean ± SD (horizontal segment).
n_action_lt_500_fingerprint |>
  ggplot() +
  geom_histogram(aes(x = nr_max)) +
  labs(
    x = "Anzahl von Aktionen pro Visit",
    y = "n",
    title = "Verteilung der User-Aktionen pro Visit",
    caption = "Der vertikale Strich zeigt den Mittelwert; der horizontale die SD"
  ) +
  theme_minimal() +
  geom_vline(
    xintercept = n_action_fingerprint_avg2,
    color = palette_okabe_ito()[1]
  ) +
  geom_segment(
    # Both ends are centred on the mean of the filtered data. The left end
    # previously used `n_action_fingerprint_avg` (unfiltered data).
    x = n_action_fingerprint_avg2 - n_action_fingerprint_sd2,
    y = 0,
    xend = n_action_fingerprint_avg2 + n_action_fingerprint_sd2,
    yend = 0,
    color = palette_okabe_ito()[2],
    size = 2
  ) +
  annotate(
    "label",
    x = n_action_fingerprint_avg2,
    y = 1500,
    label = paste0("MW = ", n_action_fingerprint_avg2)
  ) +
  annotate(
    "label",
    x = n_action_fingerprint_avg2 + n_action_fingerprint_sd2,
    y = 0,
    label = paste0("SD = ", n_action_fingerprint_sd2)
  )
|>
n_action_w_date ggplot(aes(x = month_date, y = nr_max)) +
stat_summary(fun = mean, geom = "point", size = 2) +
stat_summary(
fun.data = mean_sdl,
fun.args = list(mult = 1),
geom = "errorbar",
width = 0.2
+
) geom_smooth(method = "lm") +
labs(title = "The number of actions per visit has incresed over time")
|>
n_action_w_date_fingerprint ggplot(aes(x = month_date, y = nr_max)) +
stat_summary(fun = mean, geom = "point", size = 2) +
stat_summary(
fun.data = mean_sdl,
fun.args = list(mult = 1),
geom = "errorbar",
width = 0.2
+
) geom_smooth(method = "lm") +
labs(title = "The number of actions per visit has incresed over time")
lm(nr_max ~ month_date, data = n_action_w_date)
Call:
lm(formula = nr_max ~ month_date, data = n_action_w_date)
Coefficients:
(Intercept) month_date
-5.956e+02 3.937e-07
lm(nr_max ~ month_date, data = n_action_w_date_fingerprint)
Call:
lm(formula = nr_max ~ month_date, data = n_action_w_date_fingerprint)
Coefficients:
(Intercept) month_date
-1.186e+03 7.503e-07
|>
n_action_w_date mutate(week_date = as.Date(week_date)) |>
ggplot(aes(x = week_date, y = nr_max)) +
stat_summary(fun = mean, geom = "point", size = 2) +
stat_summary(fun.data = mean_sdl, geom = "errorbar", width = 0.2) +
geom_smooth(method = "lm") +
labs(title = "The number of actions per visit has incresed over time")
|>
n_action_w_date_fingerprint mutate(week_date = as.Date(week_date)) |>
ggplot(aes(x = week_date, y = nr_max)) +
stat_summary(fun = mean, geom = "point", size = 2) +
stat_summary(fun.data = mean_sdl, geom = "errorbar", width = 0.2) +
geom_smooth(method = "lm") +
labs(title = "The number of actions per fingerprint has incresed over time")
lm(nr_max ~ week_date, data = n_action_w_date)
Call:
lm(formula = nr_max ~ week_date, data = n_action_w_date)
Coefficients:
(Intercept) week_date
-5.93e+02 3.92e-07
lm(nr_max ~ week_date, data = n_action_w_date_fingerprint)
Call:
lm(formula = nr_max ~ week_date, data = n_action_w_date_fingerprint)
Coefficients:
(Intercept) week_date
-1.178e+03 7.453e-07
# Classify each visit by its number of actions:
# < 30 "glimpser", 30-299 "serious user", otherwise "heavy user".
n_action_lt_500 <-
  n_action_lt_500 |>
  mutate(
    n_actions_type = case_when(
      nr_max < 30 ~ "glimpser",
      nr_max < 300 ~ "serious user",
      TRUE ~ "heavy user"
    )
  )
|>
n_action_lt_500 count(n_actions_type) |>
gt()
n_actions_type | n |
---|---|
glimpser | 7388 |
heavy user | 465 |
serious user | 5773 |
# Classify each fingerprint by its number of actions:
# < 30 "glimpser", 30-299 "serious user", otherwise "heavy user".
n_action_lt_500_fingerprint <-
  n_action_lt_500_fingerprint |>
  mutate(
    n_actions_type = case_when(
      nr_max < 30 ~ "glimpser",
      nr_max < 300 ~ "serious user",
      TRUE ~ "heavy user"
    )
  )
|>
n_action_lt_500_fingerprint count(n_actions_type) |>
gt()
n_actions_type | n |
---|---|
glimpser | 3269 |
heavy user | 334 |
serious user | 3168 |
# Number of visits per user type and month.
# Each visit is classified first and then counted per month and type. The
# previous version ran count(nr_max) before classifying, so the second
# count() tallied distinct nr_max values per type instead of visits.
n_action_w_date |>
  ungroup() |>
  mutate(
    n_actions_type = case_when(
      nr_max < 30 ~ "glimpser",
      nr_max < 300 ~ "serious user",
      TRUE ~ "heavy user"
    )
  ) |>
  count(month_date, n_actions_type) |>
  ggplot(aes(
    x = month_date,
    y = n,
    color = n_actions_type,
    group = n_actions_type
  )) +
  geom_point() +
  geom_line()
# Number of fingerprints per user type and month.
# Each row is classified first and then counted per month and type. The
# previous version ran count(nr_max) before classifying, so the second
# count() tallied distinct nr_max values per type instead of rows.
n_action_w_date_fingerprint |>
  ungroup() |>
  mutate(
    n_actions_type = case_when(
      nr_max < 30 ~ "glimpser",
      nr_max < 300 ~ "serious user",
      TRUE ~ "heavy user"
    )
  ) |>
  count(month_date, n_actions_type) |>
  ggplot(aes(
    x = month_date,
    y = n,
    color = n_actions_type,
    group = n_actions_type
  )) +
  geom_point() +
  geom_line()
Die Verweildauer wurde berechnet als Differenz zwischen kleinstem und größtem Datumszeitwert (POSIXct) eines Visits (also pro Wert der Variablen `idvisit`), vgl. [Funktion diff_time](https://github.com/sebastiansauer/hans/blob/main/funs/diff_time.R). Diese Variable heißt `time_diff` im Objekt `time_spent`.
Dabei wird das Objekt data_separated_filtered
herangezogen, vgl. die Definition des Targets “time_spent” in der Targets-Pipeline.
Die Visit-Zeit wurde auf unter 600 Min. begrenzt, d.h. Visits mit einer Dauer von 600 Min. oder mehr wurden aus der Analyse ausgeschlossen.
|>
time_spent head(30)
<-
time_spent |>
time_spent # compute time (t) in minutes (min):
mutate(t_minutes = as.numeric(time_diff, units = "mins")) |>
filter(t_minutes < 600)
|>
time_spent_fingerprint head(30)
<-
time_spent_fingerprint |>
time_spent_fingerprint # compute time (t) in minutes (min):
mutate(t_minutes = as.numeric(time_diff, units = "mins")) |>
filter(t_minutes < 600)
Die Verweildauer ist im Folgenden dargestellt auf Grundlage oben dargestellter Berechnungsgrundlage (in Sekunden).
|>
time_spent summarise(
mean_time_diff = round(mean(time_diff), 2),
sd_time_diff = sd(time_diff),
min_time_diff = min(time_diff), # shortest duration
max_time_diff = max(time_diff) # longest
)
|>
time_spent_fingerprint summarise(
mean_time_diff = round(mean(time_diff), 2),
sd_time_diff = sd(time_diff),
min_time_diff = min(time_diff), # shortest duration
max_time_diff = max(time_diff) # longest
)
visitduration
Alternativ zur Berechnung der Verweildauer steht eine Variable, visitduration
zur Verfügung, die (offenbar) die Dauer des Visits misst bzw. messen soll.
Allerdings resultieren substanziell andere Werte, wenn man diese Variable heranzieht zur Berechnung der Verweildauer, vgl. Target time_duration
in der Targets-Pipeline.
|>
time_duration head(30)
|>
time_duration summarise(duration_sec_avg = mean(visitduration_sec, na.rm = TRUE)) |>
mutate(duration_min_avg = duration_sec_avg / 60)
|>
time_duration distinct(idvisit, .keep_all = TRUE) |>
summarise(duration_sec_avg = mean(visitduration_sec, na.rm = TRUE)) |>
mutate(duration_min_avg = duration_sec_avg / 60)
|>
time_duration distinct(fingerprint, .keep_all = TRUE) |>
summarise(duration_sec_avg = mean(visitduration_sec, na.rm = TRUE)) |>
mutate(duration_min_avg = duration_sec_avg / 60)
# Summary of the time spent per visit, in minutes.
# The result is stored as `time_spent_summary` because the table below needs
# it; previously the summary was computed but never assigned.
time_spent_summary <-
  time_spent |>
  mutate(time_diff_minutes = time_length(time_diff, unit = "minute")) |>
  summarise(
    mean_time_diff = round(mean(time_diff_minutes), 2),
    sd_time_diff = sd(time_diff_minutes),
    min_time_diff = min(time_diff_minutes), # shortest duration
    max_time_diff = max(time_diff_minutes) # longest
  )
time_spent_summary

# Compact table theme: smaller fonts and reduced cell padding.
small_padding_theme <- ggpubr::ttheme(
  tbody.style = tbody_style(size = 8), # Smaller font size can help
  colnames.style = colnames_style(size = 9, face = "bold"),
  padding = unit(c(2, 2), "mm") # Reduce horizontal and vertical padding
)

# Render the summary as a table graphic without row names.
ggpubr::ggtexttable(
  time_spent_summary,
  rows = NULL,
  theme = small_padding_theme
)
# Visits lasting between 1 and 120 minutes.
# Filtering and plotting use `t_minutes` (computed earlier from `time_diff`
# with units = "mins") so that the data match the minute-based title, axis
# label and bin width regardless of the unit `time_diff` is stored in.
time_spent2 <-
  time_spent |>
  filter(t_minutes > 1, t_minutes < 120)

time_spent2 |>
  ggplot(aes(x = t_minutes)) +
  geom_histogram(binwidth = 10) +
  theme_minimal() +
  labs(
    y = "n",
    x = "Verweildauer in HaNS pro Visit in Minuten",
    title = "Verweildauer begrenzt auf 1-120 Minuten",
    caption = "bindwidth = 10 Min."
  )
Die Einheit von time_spent
ist Sekunden.
<-
time_spent_by_month |>
time_spent mutate(month_start = floor_date(time_min, "month")) |>
mutate(
month_name = month(month_start, label = TRUE, abbr = FALSE),
month_num = month(month_start, label = FALSE),
year = year(month_start)
|>
) group_by(month_num, year) |>
summarise(
time_spent_month_avg = mean(time_diff, na.rm = TRUE),
time_spent_month_sd = sd(time_diff, na.rm = TRUE)
|>
) arrange(year, month_num)
time_spent_by_month
|>
time_spent_by_month mutate(
time_spent_month_avg = round(time_spent_month_avg, 2),
time_spent_month_sd = round(time_spent_month_sd, 2)
|>
) ggtexttable()
<-
time_spent_by_month_name |>
time_spent mutate(month_start = floor_date(time_min, "month")) |>
mutate(
month_name = month(month_start, label = TRUE, abbr = FALSE),
month_num = month(month_start, label = FALSE),
year = year(month_start)
|>
) group_by(month_start, year) |>
summarise(
time_spent_month_avg = mean(time_diff, na.rm = TRUE),
time_spent_month_sd = sd(time_diff, na.rm = TRUE)
)
|>
time_spent_by_month_name ggplot(aes(x = month_start, y = time_spent_month_avg)) +
geom_line(group = 1, color = "grey60") +
geom_point()
<-
time_spent_by_year |>
time_spent mutate(month_start = floor_date(time_min, "month")) |>
mutate(
month_name = month(month_start, label = TRUE, abbr = FALSE),
month_num = month(month_start, label = FALSE),
year = year(month_start)
|>
) group_by(year) |>
summarise(
time_spent_avg = mean(time_diff, na.rm = TRUE),
time_spent_sd = sd(time_diff, na.rm = TRUE)
)
time_spent_by_year
<-
time_spent_by_week_name |>
time_spent mutate(week_start = floor_date(time_min, "week")) |>
mutate(week_num = week(week_start), year = year(week_start)) |>
group_by(week_start, year) |>
summarise(
time_spent_week_avg = mean(time_diff, na.rm = TRUE),
time_spent_week_sd = sd(time_diff, na.rm = TRUE)
)
|>
time_spent_by_week_name ggplot(aes(x = week_start, y = time_spent_week_avg)) +
geom_line(group = 1, color = "grey60") +
geom_point()
<-
time_spent_w_course_university_summary |>
time_spent_w_course_university group_by(floor_date_month) |>
summarise(
distinct_courses_n = n_distinct(course),
diff_time_mean = mean(time_diff, na.rm = TRUE),
n = n()
)
time_spent_w_course_university_summary
Was machen die Visitors eigentlich? Und wie oft?
Für das Objekt n_action_type
wurde die Spalte subtitle
in den Langformat-Daten ausgewertet, s. Funktionsdefinition von count_user_action_type
.
|>
n_action_type head(30)
Achtung: Es kann sinnvoller sein, alternativ zu dieser Analyse die Analyse auf Basis von eventcategory
heranzuziehen. Dort werden alle Arten von Events berücksichtigt. Hier, in der vorliegenden, nur ausgewählte Events.
<-
n_action_type_counted |>
n_action_type drop_na() |>
count(category, sort = TRUE) |>
mutate(prop = round(n / sum(n), 2))
|>
n_action_type_counted gt()
category | n | prop |
---|---|---|
video | 845813 | 0.84 |
click_slideChange | 61934 | 0.06 |
visit_page | 55551 | 0.06 |
Media item | 17485 | 0.02 |
login | 6550 | 0.01 |
in_media_search | 3422 | 0.00 |
Search Results Count | 2856 | 0.00 |
click_topic | 2799 | 0.00 |
Medien | 1646 | 0.00 |
logout | 1495 | 0.00 |
Kanäle | 1395 | 0.00 |
GESOA | 1358 | 0.00 |
click_channelcard | 848 | 0.00 |
Evaluation | 183 | 0.00 |
Data protection | 39 | 0.00 |
<-
n_action_type_per_month |>
n_action_type select(nr, idvisit, category) |>
ungroup() |>
left_join(time_visit_wday |> ungroup()) |>
select(-c(dow, hour, nr)) |>
drop_na() |>
mutate(month_start = floor_date(date_time, "month")) |>
count(month_start, category)
n_action_type_per_month
|>
time_visit_wday head(30)
<-
n_action_type_per_month_top3 |>
n_action_type select(nr, idvisit, category) |>
ungroup() |>
filter(category %in% c("video", "click_slideChange", "visit_page")) |>
left_join(time_visit_wday |> ungroup()) |>
select(-c(dow, hour, nr)) |>
drop_na() |>
mutate(month_start = floor_date(date_time, "month")) |>
count(month_start, category)
n_action_type_per_month_top3
|>
time_visit_wday_fingerprint head(30)
<-
n_action_type_per_month_top3_fingerprint |>
n_action_type select(nr, fingerprint, category) |>
ungroup() |>
filter(category %in% c("video", "click_slideChange", "visit_page")) |>
left_join(time_visit_wday_fingerprint |> ungroup()) |>
select(-c(dow, hour, nr)) |>
drop_na() |>
mutate(month_start = floor_date(date_time, "month")) |>
count(month_start, category)
n_action_type_per_month_top3_fingerprint
<-
n_action_type_course_uni |>
n_action_type left_join(course_and_uni_per_visit |> mutate(idvisit = as.integer(idvisit)))
<-
n_action_type_per_month_top3_per_course |>
n_action_type_course_uni filter(category %in% c("video", "click_slideChange", "visit_page")) |>
drop_na() |>
mutate(month_start = floor_date(actiondetails_0_timestamp, "month")) |>
count(course, month_start, category)
eventcategory
Für folgende Analyse wurde eine andere Variable als oben herangezogen, nämlich eventcategory
. Dadurch resultieren etwas andere Ergebnisse.
<-
data_separated_filtered_count |>
data_separated_filtered filter(type == "eventcategory") |>
count(value, sort = TRUE) |>
mutate(prop = n / sum(n))
data_separated_filtered_count
Als Excel-Datei abspeichern:
#data_separated_filtered_count |>
# writexl::write_xlsx(path = "obj/data_separated_filtered_count.xlsx")
Was ist die Hauptaktivität pro User? - Verteilung
# Distribution of the main activity per unit (visit or fingerprint).
#
# The main activity of a unit is its most frequent action category. The former
# `max(category)` returned the largest value in sort order (for text: the
# alphabetically last category), not the most frequent one.
# Ties are resolved by taking the first category in sort order. Rows with a
# missing category are ignored, so units without any known category are
# dropped.
#
# actions: action-level data containing `category` and `id_col`.
# id_col:  name (string) of the id column, e.g. "idvisit".
# Returns one row per `category_max` with the number of units `n`.
main_activity_distro <- function(actions, id_col) {
  actions |>
    ungroup() |>
    drop_na(category) |>
    count(across(all_of(id_col)), category, name = "n_actions") |>
    group_by(across(all_of(id_col))) |>
    slice_max(n_actions, n = 1, with_ties = FALSE) |>
    ungroup() |>
    count(category_max = category)
}

n_action_type_distro <- main_activity_distro(n_action_type, "idvisit")

n_action_type_distro

n_action_type_distro_fingerpr <-
  main_activity_distro(n_action_type, "fingerprint")

# print the fingerprint-based table (the visit-based one was printed before)
n_action_type_distro_fingerpr
# Number of actions per category, most frequent first
n_action_type_counted <- count(n_action_type, category, sort = TRUE)

# Same count within each course; rows with missing values are removed
n_action_type_course_uni_counted <-
  n_action_type_course_uni |>
  group_by(course) |>
  count(category, sort = TRUE) |>
  drop_na()
# Names of the days of the week, indexed by the numeric `dow` column.
# Note: adjust the start of the week (Sunday or Monday) as required.
days_of_week <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)

# Translate day numbers into a factor of day names (levels in week order)
dow_as_factor <- function(dow) {
  factor(days_of_week[dow], levels = days_of_week)
}

time_visit_wday$dow2 <- dow_as_factor(time_visit_wday$dow)

time_visit_wday_fingerprint$dow2 <-
  dow_as_factor(time_visit_wday_fingerprint$dow)
# Share of visits per hour of day (fingerprint-based data)
time_visit_wday_fingerprint |>
  as_tibble() |>
  count(hour) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "HaNS-Nutzer sind keine Frühaufsteher",
    x = "Uhrzeit",
    y = "Anteil"
  )
# coord_polar()

# Bar chart of the share of visits per weekday (`dow2`).
# polar: if TRUE, the bars are drawn in polar coordinates.
plot_logins_by_dow <- function(visit_times, polar = FALSE) {
  p <- visit_times |>
    as_tibble() |>
    count(dow2) |>
    mutate(prop = n / sum(n)) |>
    ggplot(aes(x = dow2, y = prop)) +
    geom_col() +
    theme_minimal() +
    labs(
      title = "Verteilung der HaNS-Logins nach Wochentagen",
      x = "Wochentag",
      y = "Anteil"
    )
  if (polar) {
    p <- p + coord_polar()
  }
  p
}

plot_logins_by_dow(time_visit_wday)

plot_logins_by_dow(time_visit_wday, polar = TRUE)

plot_logins_by_dow(time_visit_wday_fingerprint)
# Share of visits per hour of day, one facet per weekday (`dow2`).
# Shares are computed within each weekday, so every facet sums to 1.
#
# visit_times: visit-time data containing `dow2` and `hour`.
# polar:       if TRUE, the bars are drawn in polar coordinates.
plot_logins_by_dow_hour <- function(visit_times, polar = FALSE) {
  p <- visit_times |>
    as_tibble() |>
    count(dow2, hour) |>
    group_by(dow2) |>
    mutate(prop = n / sum(n)) |>
    ungroup() |>
    ggplot(aes(x = hour, y = prop)) +
    geom_col() +
    facet_wrap(~dow2) +
    theme_minimal() +
    labs(
      title = "Verteilung der HaNS-Logins nach Wochentagen und Uhrzeiten",
      # the x axis shows the hour of day; the weekday is the facet
      x = "Uhrzeit",
      y = "Anteil"
    )
  if (polar) {
    p <- p + coord_polar()
  }
  p
}

plot_logins_by_dow_hour(time_visit_wday)

plot_logins_by_dow_hour(time_visit_wday, polar = TRUE)

plot_logins_by_dow_hour(time_visit_wday_fingerprint)

plot_logins_by_dow_hour(time_visit_wday_fingerprint, polar = TRUE)
# Add the calendar date and the first day of the month to visit-time data.
add_visit_dates <- function(visit_times) {
  visit_times |>
    ungroup() |>
    mutate(
      date = as.Date(date_time),
      month_start = floor_date(date_time, "month")
    )
}

# Layers shared by all visit heatmaps.
#
# bin_days: width of one x bin in days (the y bin width is always 1).
# caption:  plot caption describing the x binning.
# Only ONE x scale is added: a second `scale_x_date()` would replace the first
# one (with a message), so the former `date_breaks`/`date_labels` settings
# never took effect.
visit_heatmap_layers <- function(bin_days, caption) {
  list(
    geom_bin2d(binwidth = c(bin_days, 1)),
    scale_x_date(breaks = breaks_pretty()),
    scale_fill_viridis_c(),
    theme(legend.position = "bottom"),
    labs(caption = caption)
  )
}

time2 <- add_visit_dates(time_visit_wday)

# Heatmap: 1 day x 1 hour
time2 |>
  ggplot(aes(x = date, y = hour)) +
  visit_heatmap_layers(1, "Each x-bin maps to one day")

time2_fingerprint <- add_visit_dates(time_visit_wday_fingerprint)

time2_fingerprint |>
  ggplot(aes(x = date, y = hour)) +
  visit_heatmap_layers(1, "Each x-bin maps to one day")

# Heatmap: 1 week x 1 hour. The axis shows dates (not week numbers), so it is
# labelled accordingly.
time2 |>
  ggplot(aes(x = date, y = hour)) +
  visit_heatmap_layers(7, "Each x-bin maps to one week") +
  labs(x = "Date")

time2_fingerprint |>
  ggplot(aes(x = date, y = hour)) +
  visit_heatmap_layers(7, "Each x-bin maps to one week") +
  labs(x = "Date")

# Heatmap: 1 week x 1 day of week
time2 |>
  ggplot(aes(x = date, y = dow)) +
  visit_heatmap_layers(7, "Each x-bin maps to one week") +
  labs(x = "Date", y = "Day of Week") +
  scale_y_continuous(breaks = 1:7)

time2_fingerprint |>
  ggplot(aes(x = date, y = dow)) +
  visit_heatmap_layers(7, "Each x-bin maps to one week") +
  labs(x = "Date", y = "Day of Week") +
  scale_y_continuous(breaks = 1:7)
Berechnungsgrundlage: Für diese Analyse wurden alle Events der Kategorie `llm` gefiltert.
# Frequency table of all `eventcategory` values that contain "llm"
data_separated_filtered_ai <-
  data_separated_filtered |>
  filter(type == "eventcategory") |>
  filter(str_detect(value, "llm")) |>
  count(value, sort = TRUE) |>
  mutate(prop = proportions(n))

data_separated_filtered_ai
message_to_llm
# All rows whose value mentions "message_to_llm"
llm_interactions <- filter(
  data_separated_filtered,
  str_detect(value, "message_to_llm")
)

# Number of such rows per visit, largest first
llm_interactions_count <-
  llm_interactions |>
  count(idvisit, sort = TRUE, name = "messages_to_llm_n")

describe_distribution(
  llm_interactions_count,
  messages_to_llm_n,
  centrality = c("mean", "median")
)
# Per visit: does any value contain "llm"? Then count visits and their share.
data_separated_filtered_llm_interact <-
  data_separated_filtered |>
  group_by(idvisit) |>
  summarise(llm_used_during_visit = any(str_detect(value, "llm"))) |>
  count(llm_used_during_visit) |>
  mutate(prop = round(n / sum(n), 2))

gt(data_separated_filtered_llm_interact)
llm_used_during_visit | n | prop |
---|---|---|
FALSE | 13419 | 0.94 |
TRUE | 788 | 0.06 |
# Per fingerprint: does any value contain "llm"? Then count fingerprints and
# their share.
data_separated_filtered_llm_interact_fingerprint <-
  data_separated_filtered |>
  group_by(fingerprint) |>
  summarise(llm_used_during_visit = any(str_detect(value, "llm"))) |>
  count(llm_used_during_visit) |>
  mutate(prop = round(n / sum(n), 2))

gt(data_separated_filtered_llm_interact_fingerprint)
llm_used_during_visit | n | prop |
---|---|---|
FALSE | 6649 | 0.93 |
TRUE | 511 | 0.07 |
head(idvisit_has_llm, 30)

# Visits with and without LLM use per month, with the share within each month.
# The result stays grouped by `year_month`.
idvisit_has_llm_timeline <-
  idvisit_has_llm |>
  count(year_month, uses_llm) |>
  group_by(year_month) |>
  mutate(prop = round(n / sum(n), 2))

idvisit_has_llm_timeline
# Shares of visits with/without LLM use over time.
# `group` (not `groups`) is the ggplot2 aesthetic that connects the points of
# one series; it is set once in the global mapping.
idvisit_has_llm |>
  count(year_month, uses_llm) |>
  ungroup() |>
  mutate(year_month_date = ymd(paste0(year_month, "-01"))) |>
  group_by(year_month_date) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(
    x = year_month_date,
    y = prop,
    color = uses_llm,
    group = uses_llm
  )) +
  geom_point() +
  geom_line() +
  labs(
    title = "Visitors, die mit dem LLM interagieren im Zeitverlauf (Anteile)"
  ) +
  scale_x_date(breaks = pretty_breaks())

# Absolute number of visits with/without LLM use over time.
# No grouping is needed here because nothing is computed per group.
idvisit_has_llm |>
  count(year_month, uses_llm) |>
  ungroup() |>
  mutate(year_month_date = ymd(paste0(year_month, "-01"))) |>
  ggplot(aes(
    x = year_month_date,
    y = n,
    color = uses_llm,
    group = uses_llm
  )) +
  geom_point() +
  geom_line() +
  labs(
    title = "Visitors, die mit dem LLM interagieren im Zeitverlauf (Anzahl)"
  ) +
  scale_x_date(breaks = pretty_breaks())
# Number of "llm" event-category rows per visit
d_n_interactions_w_llm <-
  data_separated_filtered |>
  filter(type == "eventcategory", str_detect(value, "llm")) |>
  group_by(idvisit) |>
  summarise(n_interactions_w_llm = n())

d_n_interactions_w_llm |>
  select(n_interactions_w_llm) |>
  describe_distribution() |>
  print_md()
Variable | Mean | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing |
---|---|---|---|---|---|---|---|---|
n_interactions_w_llm | 165.59 | 216.45 | 459 | (1.00, 500.00) | 0.69 | -1.46 | 640 | 0 |
Ausgewertet wird im Folgenden die Variable “click_transcript_word”.
# Share of non-empty "subtitle" rows that contain "click_transcript_word"
data_separated_filtered |>
  # keep subtitle rows and drop empty ones
  filter(type == "subtitle", !is.na(value), value != "") |>
  count(click_transcript_word = str_detect(value, "click_transcript_word")) |>
  mutate(prop = round(n / sum(n), 2)) |>
  tt()
click_transcript_word | n | prop |
---|---|---|
FALSE | 1138774 | 0.99 |
TRUE | 8439 | 0.01 |
# Count, per month, the units (visits or fingerprints) that contain at least
# one "click_transcript_word" event.
#
# data:   long-format data containing `value` and `id_col`.
# id_col: name (string) of the id column, e.g. "idvisit".
# Returns one row per `month_visit` with the number of units `n`. The month is
# taken from the first row of each unit whose `value` parses as a date-time.
count_transcript_click_units_per_month <- function(data, id_col) {
  data |>
    group_by(across(all_of(id_col))) |>
    # keep only units WITH a "click_transcript_word" event; the former
    # `!any(...)` kept exactly the units without one. `na.rm = TRUE` stops
    # missing values from turning the test into NA.
    filter(any(str_detect(value, "click_transcript_word"), na.rm = TRUE)) |>
    ungroup() |>
    # values that are not timestamps become NA and are dropped below
    mutate(date_visit = ymd_hms(value)) |>
    mutate(month_visit = floor_date(date_visit, unit = "month")) |>
    drop_na(date_visit) |>
    group_by(across(all_of(id_col))) |>
    slice(1) |>
    ungroup() |>
    count(month_visit)
}

click_transcript_word_per_month <-
  count_transcript_click_units_per_month(data_separated_filtered, "idvisit")

click_transcript_word_per_month

click_transcript_word_per_month_fingerprint <-
  count_transcript_click_units_per_month(
    data_separated_filtered,
    "fingerprint"
  )

click_transcript_word_per_month_fingerprint
head(data_long, 300)

# Regex used to pull the event category out of `value`:
#   1. match the literal text `Category: "`,
#   2. lazily capture any characters, `(.*?)`,
#   3. stop right before the literal text `', Action` (lookahead: the text is
#      required but not part of the match).
# str_extract() returns the whole match, i.e. including the `Category: `
# prefix.
regex_pattern <- "Category: \"(.*?)(?=', Action)"

# Count the extracted categories of all rows mentioning "transcript";
# quote characters are stripped from the extracted text.
ai_actions_count <-
  data_long |>
  # slice(1:1000) |>
  filter(str_detect(value, "transcript")) |>
  mutate(
    category = value |>
      str_extract(regex_pattern) |>
      str_replace_all("[\"']", "")
  ) |>
  select(category) |>
  count(category, sort = TRUE)

tt(ai_actions_count)
category | n |
---|---|
NA | 217862 |
Category: clear_transcript_text_for_llm_context | 104111 |
Category: click_transcript_word | 8439 |
Category: select_transcript_text_for_llm_context | 576 |
Category: click_button | 43 |
Category: llm_response_de | 3 |
Category: llm_response_en | 3 |
In diesem Objekt wird gezählt, wie oft der String "click_transcript_word" in den Daten (Langformat) gefunden wird, s. Target `ai_transcript_clicks_per_month` in der Targets-Pipeline.
head(ai_transcript_clicks_per_month, 30)

# Rows with and without transcript clicks per month, with the share within
# each month. The result stays grouped by `year_month`.
ai_transcript_clicks_per_month_count <-
  ai_transcript_clicks_per_month |>
  count(year_month, clicks_transcript_any) |>
  group_by(year_month) |>
  mutate(prop = round(n / sum(n), 2))

ai_transcript_clicks_per_month_count
`llm_response` - Tokens und Tokenlänge
|>
llm_response_text count(lang) |>
mutate(prob = n / sum(n))
|>
llm_response_text describe_distribution(select = "tokens_n")
verify_option_wrong
und verify_option_correct
# Number of "verify_option_wrong"/"verify_option_correct" rows per visit
verify_option_summary <-
  data_separated_filtered |>
  filter(value %in% c("verify_option_wrong", "verify_option_correct")) |>
  group_by(idvisit) |>
  summarise(verify_option = n())

verify_option_summary |>
  describe_distribution(verify_option) |>
  print_md()
Variable | Mean | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing |
---|---|---|---|---|---|---|---|---|
verify_option | 35.24 | 36.76 | 30 | (4.00, 245.00) | 2.64 | 9.11 | 207 | 0 |
# dplyr version of the computation below, kept for reference:
# verify_option_summary_fingerprint <-
#   data_separated_filtered |>
#   group_by(fingerprint) |>
#   filter(value == "verify_option_wrong" | value == "verify_option_correct") |>
#   summarise(verify_option = n())

# Convert to a data.table by reference; `data_separated_filtered` itself is
# a data.table from here on.
setDT(data_separated_filtered)

# Number of verify events per fingerprint, returned as a tibble
verify_option_summary_fingerprint <- as_tibble(
  data_separated_filtered[
    # i: keep the two verify events only
    value %in% c("verify_option_wrong", "verify_option_correct"),
    # j: number of rows per group
    .(verify_option = .N),
    # by: one result row per fingerprint
    by = .(fingerprint)
  ]
)

verify_option_summary_fingerprint |>
  describe_distribution(verify_option) |>
  print_md()
Variable | Mean | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing |
---|---|---|---|---|---|---|---|---|
verify_option | 41.68 | 46.81 | 35 | (4.00, 252.00) | 2.35 | 5.94 | 175 | 0 |
verify_option_wrong
verify_option_div_by_4 - geteilt durch 4
<-
verify_option_summary |>
verify_option_summary mutate(verify_option_div_by_4 = verify_option / 4)
# Histogram of the verify count divided by 4
gghistogram(verify_option_summary, x = "verify_option_div_by_4")

# Descriptive statistics of the same quantity
verify_option_summary |>
  mutate(verify_option_div_by_4 = verify_option / 4) |>
  describe_distribution(verify_option_div_by_4) |>
  print_md()
Variable | Mean | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing |
---|---|---|---|---|---|---|---|---|
verify_option_div_by_4 | 8.81 | 9.19 | 7.50 | (1.00, 61.25) | 2.64 | 9.11 | 207 | 0 |
# Put both ways of counting MC answers side by side (natural full join on the
# columns shared by both tables)
check_if_both_methods_give_same_number <-
  full_join(n_mc_answers_selected, verify_option_summary)

gt(head(check_if_both_methods_give_same_number, 20))
n | verify_option | verify_option_div_by_4 |
---|---|---|
1560 | ||
6 | 28 | 7.00 |
1569 | ||
2 | 14 | 3.50 |
2021 | ||
2 | 21 | 5.25 |
2022 | ||
10 | 126 | 31.50 |
2394 | ||
2 | 21 | 5.25 |
2718 | ||
2 | 7 | 1.75 |
2740 | ||
2 | 7 | 1.75 |
2883 | ||
2 | 126 | 31.50 |
2902 | ||
2 | 126 | 31.50 |
2912 | ||
2 | 77 | 19.25 |
2932 | ||
6 | 35 | 8.75 |
2950 | ||
2 | 56 | 14.00 |
2978 | ||
14 | 245 | 61.25 |
2979 | ||
4 | 35 | 8.75 |
3103 | ||
2 | 14 | 3.50 |
3257 | ||
2 | 7 | 1.75 |
3691 | ||
2 | 35 | 8.75 |
3700 | ||
4 | 84 | 21.00 |
3741 | ||
2 | 21 | 5.25 |
3804 | ||
2 | 70 | 17.50 |
Nein, beide Methoden liefern nicht die gleiche Zahl.
# Order the MC answers by time and add the running total of `n`
mc_answers_with_timestamps <-
  mc_answers_with_timestamps |>
  mutate(month_start = floor_date(timestamp, "month")) |>
  ungroup() |>
  arrange(timestamp) |>
  mutate(
    n_cumulated = cumsum(n),
    date = as.Date(timestamp)
  )

# x-axis limits: first and last observed date
lim <- range(mc_answers_with_timestamps$date)

ggplot(mc_answers_with_timestamps, aes(x = date, y = n_cumulated)) +
  scale_x_date(limits = lim, labels = scales::label_date_short()) +
  geom_point() +
  geom_line()
generate_questionaire
# dplyr version of the computation below, kept for reference:
# generate_questionaire_summary <-
#   data_separated_filtered |>
#   group_by(idvisit) |>
#   filter(value == "generate_questionaire") |>
#   summarise(generate_questionaire = n())

# Convert to a data.table by reference
setDT(data_separated_filtered)

# Number of "generate_questionaire" rows per visit
generate_questionaire_summary <- data_separated_filtered[
  # i: keep the generate events only
  value == "generate_questionaire",
  # j: number of rows per group
  .(generate_questionaire = .N),
  # by: one result row per visit
  by = .(idvisit)
]

generate_questionaire_summary |>
  describe_distribution(generate_questionaire) |>
  print_md()
Variable | Mean | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing |
---|---|---|---|---|---|---|---|---|
generate_questionaire | 3.11 | 5.93 | 2 | (1.00, 66.00) | 5.89 | 46.07 | 367 | 0 |
# Both inputs are converted to data.tables by reference
setDT(generate_questionaire_summary)
setDT(verify_option_summary)

# Full join on `idvisit`: `all = TRUE` keeps visits that occur in only one of
# the two tables
prior_existing_questions_summary <- merge(
  generate_questionaire_summary,
  verify_option_summary,
  by = "idvisit",
  all = TRUE
)

# Add the difference of both counts as a new column (by reference)
prior_existing_questions_summary[
  ,
  prior_existing_questions_n := verify_option - generate_questionaire
]

# dplyr version of the steps above, kept for reference:
# prior_existing_questions_summary <-
#   generate_questionaire_summary |>
#   full_join(verify_option_summary) |>
#   mutate(prior_existing_questions_n = verify_option - generate_questionaire)

prior_existing_questions_summary |>
  # drop_na() |>
  gghistogram(x = "prior_existing_questions_n")

prior_existing_questions_summary |>
  describe_distribution(prior_existing_questions_n) |>
  print_md()
Variable | Mean | SD | IQR | Range | Skewness | Kurtosis | n | n_Missing |
---|---|---|---|---|---|---|---|---|
prior_existing_questions_n | 38.87 | 44.57 | 39 | (-59.00, 236.00) | 1.98 | 5.40 | 91 | 392 |
Wie viel Zeit verbringen die Nutzer mit dem Betrachten von Videos (“Glotzdauer”)?
Achtung: Die Videozeit ist schwierig auszuwerten. Die Nutzer beenden Videos nicht, indem sie auf “Pause” drücken, sondern indem sie andere Aktionen durchführen. Dies ist aber analytisch schwer abzubilden.
Vgl. die Definition des Targets `glotzdauer` in der Pipeline.
Kurz gesagt wird die Zeit-Differenz zwischen zwei aufeinander folgenden “Play” und “Pause” Aktionen berechnet.
Allerdings hat dieses Vorgehen Schwierigkeiten: Nicht immer folgt auf ein “Play” ein “Pause”. Es ist schwer auszuwerten, wann die Betrachtung eines Videos endet. Daher ist diese Analyse nur vorsichtig zu interpretieren.
Die Definition der Funktion glotzdauer.R ist online dokumentiert.
# Show the first 30 rows
head(data_separated_distinct_slice, 30)
Für die folgende Darstellung wurden die absoluten Zeitwerte verwendet, d.h. ohne Vorzeichen.
# Histogram of the viewing times ("glotzdauer") below 10 minutes
data_separated_distinct_slice |>
  # negative viewing times are treated the same as positive ones
  mutate(time_diff = abs(time_diff)) |>
  # keep only viewing times shorter than 10 minutes; the comparison is done
  # in seconds so that the cutoff does not depend on the units of `time_diff`
  filter(as.numeric(time_diff, units = "secs") < 60 * 10) |>
  ggplot(aes(x = time_diff)) +
  geom_histogram() +
  scale_x_time() +
  labs(
    x = "Time interval [minutes]",
    caption = "Only time intervals less than 10 minutes. It is assumed that video time is positive only (no negative time intervals)."
  ) +
  theme_minimal()
# Viewing times in absolute seconds and minutes
glotzdauer_prepped <-
  data_separated_distinct_slice |>
  # negative viewing times are treated the same as positive ones
  mutate(time_diff_abs_sec = abs(as.numeric(time_diff, units = "secs"))) |>
  # keep only viewing times shorter than 10 minutes
  filter(time_diff_abs_sec < 60 * 10) |>
  mutate(time_diff_abs_min = time_diff_abs_sec / 60)

# Descriptive statistics of the viewing time (seconds and minutes)
glotzdauer_tbl <-
  glotzdauer_prepped |>
  select(time_diff_abs_sec, time_diff_abs_min) |>
  describe_distribution()

glotzdauer_tbl

# Mean viewing time per month. The mean is taken over ABSOLUTE values, in
# line with the filter above; averaging the signed `time_diff` would let
# negative and positive intervals cancel each other out.
glotzdauer_prepped_tbl <-
  glotzdauer_prepped |>
  mutate(first_of_month = floor_date(date, unit = "month")) |>
  group_by(first_of_month) |>
  summarise(time_diff_mean = mean(abs(time_diff), na.rm = TRUE))

glotzdauer_prepped_tbl
.