library(tidyverse)
#library(stringr) # Strings verarbeiten
library(here) # liest aktuelles Verzeichnis aus
library(janitor)
#library(lubridate) # Mit Zeitangaben arbeiten
library(tictoc)
library(data.table)Challenge 03 – Solution
“Eine Hackathon-Variante zur Evaluation der Klickdaten des KI-Tools ‘HaNS’”
1 Daten verstehen
1.1 Setup
Wir laden den Datensatz, wie im letzten Schritt herausgegangen:
tic()
d_file_path <- "https://raw.githubusercontent.com/sebastiansauer/hans-hackathon2025/refs/heads/main/objects/d_students_only.csv"
d_input <- read_csv(d_file_path,
col_types = cols(.default = "c"))
toc()
## 0.755 sec elapsedd_input_names_sanitized <-
d_input |> clean_names()
names(d_input_names_sanitized) |>
head(10)
## [1] "x1" "id_visit"
## [3] "fingerprint" "action_details_0_subtitle"
## [5] "action_details_0_timestamp" "action_details_1_timestamp"
## [7] "action_details_1_subtitle" "action_details_2_subtitle"
## [9] "action_details_2_timestamp" "action_details_3_timestamp"d_input_names_sanitized |>
select(1:20) |>
glimpse()
## Rows: 376
## Columns: 20
## $ x1 <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9"…
## $ id_visit <chr> "3294", "3292", "3293", "3291", "3290", "32…
## $ fingerprint <chr> "b60fd403ef2a6ad5", "f1f2268d3eb2725a", "95…
## $ action_details_0_subtitle <chr> "https://hswt.de&password=<ljse9wp0hps9y/lo…
## $ action_details_0_timestamp <chr> "2025-07-07 23:16:05", "2025-07-07 22:07:02…
## $ action_details_1_timestamp <chr> "2025-07-07 23:16:10", "2025-07-07 22:07:04…
## $ action_details_1_subtitle <chr> "Category: \"login', Action: \"success\"", …
## $ action_details_2_subtitle <chr> "https://hswt.de&password=<ljse9wp0hps9y/?e…
## $ action_details_2_timestamp <chr> "2025-07-07 23:16:10", "2025-07-07 22:07:04…
## $ action_details_3_timestamp <chr> "2025-07-07 23:16:25", "2025-07-07 22:07:12…
## $ action_details_3_subtitle <chr> "Category: \"click_videocard_search_semeste…
## $ action_details_4_timestamp <chr> "2025-07-07 23:16:25", "2025-07-07 22:07:12…
## $ action_details_4_subtitle <chr> "Category: \"click_videocard_search_lecture…
## $ action_details_5_subtitle <chr> "Prof. Dr. Mircea Tric", "Category: \"stati…
## $ action_details_5_timestamp <chr> "2025-07-07 23:16:35", "2025-07-07 22:07:14…
## $ action_details_6_subtitle <chr> "https://hswt.de&password=<ljse9wp0hps9y/se…
## $ action_details_6_timestamp <chr> "2025-07-07 23:16:35", "2025-07-07 22:07:14…
## $ action_details_7_timestamp <chr> "2025-07-07 23:16:57", "2025-07-07 22:07:14…
## $ action_details_7_subtitle <chr> "Category: \"click_videocard', Action: \"02…
## $ action_details_8_subtitle <chr> "https://hswt.de&password=<ljse9wp0hps9y/vi…1.2 Lösungen
1.2.1 1 Pivotieren
Mit id_visit als ID:
tic()
d_long_idvisit <-
d_input_names_sanitized |>
select(-x1) |>
pivot_longer(cols = -id_visit)
toc()
## 0.059 sec elapsedglimpse(d_long_idvisit)
## Rows: 377,504
## Columns: 3
## $ id_visit <chr> "3294", "3294", "3294", "3294", "3294", "3294", "3294", "3294…
## $ name <chr> "fingerprint", "action_details_0_subtitle", "action_details_0…
## $ value <chr> "b60fd403ef2a6ad5", "https://hswt.de&password=<ljse9wp0hps9y/…1.2.2 Pivotieren mit fingerprint als ID
tic()
d_long_fingerprint <-
d_input_names_sanitized |>
select(-x1) |>
pivot_longer(cols = -fingerprint)
toc()
## 0.058 sec elapsedglimpse(d_long_fingerprint)
## Rows: 377,504
## Columns: 3
## $ fingerprint <chr> "b60fd403ef2a6ad5", "b60fd403ef2a6ad5", "b60fd403ef2a6ad5"…
## $ name <chr> "id_visit", "action_details_0_subtitle", "action_details_0…
## $ value <chr> "3294", "https://hswt.de&password=<ljse9wp0hps9y/login?eva…1.2.3 Pivotieren mit beiden ID-Variablen
tic()
d_long_idvisit_fingerprint <-
d_input_names_sanitized |>
select(-x1) |>
pivot_longer(cols = -c(id_visit, fingerprint))
toc()
## 0.062 sec elapsedglimpse(d_long_idvisit_fingerprint)
## Rows: 377,128
## Columns: 4
## $ id_visit <chr> "3294", "3294", "3294", "3294", "3294", "3294", "3294", "3…
## $ fingerprint <chr> "b60fd403ef2a6ad5", "b60fd403ef2a6ad5", "b60fd403ef2a6ad5"…
## $ name <chr> "action_details_0_subtitle", "action_details_0_timestamp",…
## $ value <chr> "https://hswt.de&password=<ljse9wp0hps9y/login?evalId=none…1.2.4 Pivotieren und Begrenzen Sie beim Pivotieren die Spalten auf die Spaltentypen - id_visit
d_input_names_sanitized_only_subtitle_cols <-
d_input_names_sanitized |>
select(id_visit, contains("subtitle"))
tic()
d_long_idvisit_only_subtitle_cols_idvisit <-
d_input_names_sanitized_only_subtitle_cols |>
pivot_longer(cols = -id_visit)
toc()
## 0.023 sec elapsedglimpse(d_long_idvisit_only_subtitle_cols_idvisit)
## Rows: 188,000
## Columns: 3
## $ id_visit <chr> "3294", "3294", "3294", "3294", "3294", "3294", "3294", "3294…
## $ name <chr> "action_details_0_subtitle", "action_details_1_subtitle", "ac…
## $ value <chr> "https://hswt.de&password=<ljse9wp0hps9y/login?evalId=none&us…BONUS:
Den Zeitverbrauch kann man sich mit profvis näher anschauen:
library(profvis)
profvis({
d_long_idvisit_only_subtitle_cols_idvisit <-
d_input_names_sanitized_only_subtitle_cols %>%
pivot_longer(cols = -id_visit)
})1.2.5 Pivotieren und Begrenzen Sie beim Pivotieren die Spalten auf die Spaltentypen - fingerprint
d_input_names_sanitized_only_subtitle_cols_fingerprint <-
d_input_names_sanitized |>
select(fingerprint, contains("subtitle"))
tic()
d_long_idvisit_only_subtitle_cols_fingerprint <-
d_input_names_sanitized_only_subtitle_cols_fingerprint |>
pivot_longer(cols = -fingerprint)
toc()
## 0.034 sec elapsedglimpse(d_long_idvisit_only_subtitle_cols_fingerprint)
## Rows: 188,000
## Columns: 3
## $ fingerprint <chr> "b60fd403ef2a6ad5", "b60fd403ef2a6ad5", "b60fd403ef2a6ad5"…
## $ name <chr> "action_details_0_subtitle", "action_details_1_subtitle", …
## $ value <chr> "https://hswt.de&password=<ljse9wp0hps9y/login?evalId=none…1.2.6 Pivotieren und Begrenzen Sie beim Pivotieren die Spalten auf die Spaltentypen - idvisit und fingerprint
d_input_names_sanitized_only_subtitle_cols_idvisit_fingerprint <-
d_input_names_sanitized |>
select(id_visit, fingerprint, contains("subtitle"))
tic()
d_long_idvisit_only_subtitle_cols_idvisit_fingerprint <-
d_input_names_sanitized_only_subtitle_cols_idvisit_fingerprint |>
pivot_longer(cols = -c(id_visit, fingerprint))
toc()
## 0.037 sec elapsedglimpse(d_long_idvisit_only_subtitle_cols_idvisit_fingerprint)
## Rows: 188,000
## Columns: 4
## $ id_visit <chr> "3294", "3294", "3294", "3294", "3294", "3294", "3294", "3…
## $ fingerprint <chr> "b60fd403ef2a6ad5", "b60fd403ef2a6ad5", "b60fd403ef2a6ad5"…
## $ name <chr> "action_details_0_subtitle", "action_details_1_subtitle", …
## $ value <chr> "https://hswt.de&password=<ljse9wp0hps9y/login?evalId=none…1.2.7 rüfen Sie, ob es stimmt, dass in der Spalte …
d_long_idvisit_only_subtitle_cols_idvisit_fingerprint %>%
pull(name) %>%
str_remove("\\d+") %>% # remove digits from each string
unique() %>% # keep only unique values
length() # count them
## [1] 1Ja, ist konstant ohne die Zahlen in der Mitte
1.2.8 Zahl extrahieren aus name
d_long_idvisit_only_subtitle_cols_idvisit_fingerprint |>
mutate(id = str_extract(name, "\\d+")) |>
select(-name)1.2.9 Schneller als pivot_longer
data.table ist eines der bekanntesten R-Pakete. Es glänzt durch Geschwindigkeit.
vars_to_pivot <-
d_input_names_sanitized %>%
select(contains("action_details_")) %>%
names()
DT <- as.data.table(d_input_names_sanitized)
tic()
out <- melt(DT,
id.vars = c("id_visit", "fingerprint"),
measure.vars = vars_to_pivot,
variable.name = "name",
value.name = "value"
)
toc()
## 0.013 sec elapsed1.2.10 Bonus: Wer ist schneller?

data.table ist deutlich schneller als tidyverse:
- https://duckdblabs.github.io/db-benchmark/
- https://towardsdatascience.com/data-table-speed-with-dplyr-syntax-yes-we-can-51ef9aaed585/
- https://timfarewell.co.uk/is-data-table-or-dplyr-faster-at-summarising-data/
- https://codepointtech.com/scale-tidyverse-to-big-data-master-dtplyr-for-dplyr-data-table/
1.3 Outro
Als RDS-Datei:
tic()
write_rds(d_long_idvisit_only_subtitle_cols_idvisit,
paste0(here(),"/objects/d_long_idvisit_only_subtitle_cols_idvisit.rds"))
toc()
## 0.134 sec elapsed
write_rds(d_long_idvisit_only_subtitle_cols_idvisit_fingerprint,
paste0(here(),"/objects/d_long_idvisit_only_subtitle_cols_idvisit_fingerprint.rds"))Als Text-Datei:
tic()
write_csv(d_long_idvisit_only_subtitle_cols_idvisit,
paste0(here(),"/objects/d_long_idvisit_only_subtitle_cols_idvisit.csv"))
toc()
## 0.064 sec elapsed
write_csv(d_long_idvisit_only_subtitle_cols_idvisit_fingerprint,
paste0(here(),"/objects/d_long_idvisit_only_subtitle_cols_idvisit_fingerprint.csv"))2 SessionInfo
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 25.10
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.1; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=de_DE.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=de_DE.UTF-8 LC_COLLATE=de_DE.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=de_DE.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Berlin
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] data.table_1.17.8 tictoc_1.2.1 janitor_2.2.1 here_1.0.1
## [5] lubridate_1.9.4 forcats_1.0.0 stringr_1.6.0 dplyr_1.1.4
## [9] purrr_1.2.0 readr_2.1.6 tidyr_1.3.1 tibble_3.3.0
## [13] ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] generics_0.1.4 stringi_1.8.7 hms_1.1.3 digest_0.6.39
## [5] magrittr_2.0.4 evaluate_1.0.5 grid_4.5.1 timechange_0.3.0
## [9] RColorBrewer_1.1-3 fastmap_1.2.0 rprojroot_2.1.1 jsonlite_2.0.0
## [13] scales_1.4.0 codetools_0.2-20 cli_3.6.5 crayon_1.5.3
## [17] rlang_1.1.6 bit64_4.6.0-1 withr_3.0.2 yaml_2.3.10
## [21] parallel_4.5.1 tools_4.5.1 tzdb_0.5.0 curl_7.0.0
## [25] vctrs_0.6.5 R6_2.6.1 lifecycle_1.0.4 snakecase_0.11.1
## [29] htmlwidgets_1.6.4 bit_4.6.0 vroom_1.6.5 pkgconfig_2.0.3
## [33] pillar_1.11.1 gtable_0.3.6 glue_1.8.0 xfun_0.54
## [37] tidyselect_1.2.1 rstudioapi_0.17.1 knitr_1.50 dichromat_2.0-0.1
## [41] farver_2.1.2 htmltools_0.5.8.1 rmarkdown_2.30 compiler_4.5.1
## [45] S7_0.2.1Wiederverwendung
MIT
Zitat
Mit BibTeX zitieren:
@online{sauer2026,
author = {Sauer, Sebastian},
title = {Challenge 03 -\/- Solution},
date = {2026-01-12},
url = {https://sebastiansauer.github.io/hans-hackathon2025/challenge03-solution.html},
langid = {de-DE}
}
Bitte zitieren Sie diese Arbeit als:
Sauer, Sebastian. 2026. “Challenge 03 -- Solution.” January
12, 2026. https://sebastiansauer.github.io/hans-hackathon2025/challenge03-solution.html.