# Monthly share of visits with vs. without LLM interaction, drawn as one line
# per `uses_llm` level. Shaded bands mark fixed calendar periods:
#   skyblue = March-July, orange = October-February.
idvisit_has_llm |>
  count(year_month, uses_llm) |>
  ungroup() |>
  # year_month is pasted with "-01" to obtain a plottable Date (first of month).
  mutate(year_month_date = ymd(paste0(year_month, "-01"))) |>
  # Share within each month: n divided by the month's total.
  group_by(year_month_date) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  # `group` (not `groups`) is the grouping aesthetic; setting it here makes
  # every layer draw one series per uses_llm level.
  ggplot(aes(
    x = year_month_date,
    y = prop,
    color = uses_llm,
    group = uses_llm
  )) +
  # --- Highlight March-July (1 Mar to 31 Jul) ---
  annotate(
    "rect",
    xmin = as.Date(c("2023-03-01", "2024-03-01", "2025-03-01")),
    xmax = as.Date(c("2023-07-31", "2024-07-31", "2025-07-31")),
    ymin = -Inf, ymax = Inf, alpha = 0.2, fill = "skyblue"
  ) +
  # --- Highlight October-February ---
  # 2024 is a leap year, so the first band ends on 29 Feb.
  annotate(
    "rect",
    xmin = as.Date(c("2023-10-01", "2024-10-01")),
    xmax = as.Date(c("2024-02-29", "2025-02-28")),
    ymin = -Inf, ymax = Inf, alpha = 0.2, fill = "orange"
  ) +
  geom_point() +
  geom_line() +
  labs(
    title = "Visitors, die mit dem LLM interagieren im Zeitverlauf (Anteile)"
  ) +
  scale_x_date(breaks = pretty_breaks())
Show the code
# Monthly number of visits with vs. without LLM interaction, drawn as one line
# per `uses_llm` level. Shaded bands mark fixed calendar periods:
#   skyblue = March-July, orange = October-February.
idvisit_has_llm |>
  count(year_month, uses_llm) |>
  ungroup() |>
  # year_month is pasted with "-01" to obtain a plottable Date (first of month).
  mutate(year_month_date = ymd(paste0(year_month, "-01"))) |>
  # No grouping is needed before plotting raw counts.
  # `group` (not `groups`) is the grouping aesthetic.
  ggplot(aes(
    x = year_month_date,
    y = n,
    color = uses_llm,
    group = uses_llm
  )) +
  # --- Highlight March-July (1 Mar to 31 Jul) ---
  annotate(
    "rect",
    xmin = as.Date(c("2023-03-01", "2024-03-01", "2025-03-01")),
    xmax = as.Date(c("2023-07-31", "2024-07-31", "2025-07-31")),
    ymin = -Inf, ymax = Inf, alpha = 0.2, fill = "skyblue"
  ) +
  # --- Highlight October-February ---
  # 2024 is a leap year, so the first band ends on 29 Feb.
  annotate(
    "rect",
    xmin = as.Date(c("2023-10-01", "2024-10-01")),
    xmax = as.Date(c("2024-02-29", "2025-02-28")),
    ymin = -Inf, ymax = Inf, alpha = 0.2, fill = "orange"
  ) +
  geom_point() +
  geom_line() +
  labs(
    title = "Visitors, die mit dem LLM interagieren im Zeitverlauf (Anzahl)"
  ) +
  scale_x_date(breaks = pretty_breaks())
8.4.3.2 Anteile
Show the code
# Stacked area chart: share of visits per month, split by `uses_llm`.
idvisit_has_llm |>
  count(year_month, uses_llm) |>
  ungroup() |>
  # First day of the month as a Date, used as the x position.
  mutate(year_month_date = ymd(paste0(year_month, "-01"))) |>
  group_by(year_month_date) |>
  # Within-month share of each uses_llm level.
  mutate(proportion = n / sum(n)) |>
  ggplot(
    aes(
      x = year_month_date,
      y = proportion,
      fill = uses_llm
    )
  ) +
  # position = "fill" rescales the stacked areas to a total of 1 per x value.
  geom_area(position = "fill") +
  scale_x_date(breaks = pretty_breaks()) +
  # Show the y axis as percentages.
  scale_y_continuous(labels = scales::label_percent()) +
  labs(
    x = "Datum",
    y = "Prozentualer Anteil der Besucher",
    fill = "Interagiert mit LLM",
    title = "Anteil der Besucher, die mit dem LLM interagieren (Prozent)"
  )
# Histogram of token_length in prompt_length_no_prompts, 10-token-wide bins.
ggplot(
  data = prompt_length_no_prompts,
  mapping = aes(x = token_length)
) +
  geom_histogram(binwidth = 10) +
  theme_minimal() +
  labs(
    x = "Prompt length (in tokens)",
    y = "Number of prompts",
    title = "Length of prompts sent to the LLM"
  )
# Horizontal boxplots of prompt length (tokens) per university, with the
# "mean_se" summary overlaid via ggpubr's `add` argument.
ggboxplot(
  prompt_length_date_uni_course,
  x = "university",
  y = "token_length",
  add = "mean_se" # no trailing comma: it would pass an empty extra argument
) +
  theme_minimal() +
  labs(
    title = "Prompt length by university",
    x = "University",
    y = "Prompt length (in tokens)"
  ) +
  # Flip so the university labels are read horizontally.
  coord_flip()
# Horizontal boxplots of prompt length (tokens) per course, with the
# "mean_se" summary overlaid via ggpubr's `add` argument.
ggboxplot(
  prompt_length_date_uni_course,
  x = "course",
  y = "token_length",
  add = "mean_se" # no trailing comma: it would pass an empty extra argument
) +
  theme_minimal() +
  labs(
    title = "Prompt length by course",
    x = "Course",
    y = "Prompt length (in tokens)"
  ) +
  # Flip so the course labels are read horizontally.
  coord_flip()
# Keep only rows with a known month and coerce the month to Date so that
# scale_x_date() can be applied.
filtered_data <- prompt_length_date_uni_course |>
  filter(!is.na(floor_date_month)) |>
  mutate(floor_date_month_date = as.Date(floor_date_month))

# x-axis limits: earliest and latest observed month.
lim <- range(filtered_data$floor_date_month_date, na.rm = TRUE)

# One violin per month, with the monthly mean (point) and its standard
# error (error bar) on top.
ggplot(
  filtered_data,
  aes(x = floor_date_month_date, y = token_length)
) +
  geom_violin(aes(group = floor_date_month_date)) +
  stat_summary(fun = "mean", geom = "point") +
  stat_summary(fun.data = "mean_se", geom = "errorbar", width = 0.2) +
  scale_x_date(limits = lim, labels = scales::label_date_short()) +
  theme_minimal() +
  labs(
    title = "Prompt length over time",
    x = "Date",
    y = "Prompt length (in tokens)",
    caption = "The dots represent the mean and the error bars the standard error of the mean."
  )
8.6 Anzahl der Interaktionen bei den Usern, die mit dem LLM interagieren
# Background rectangles come from the project helper comp_semester_rects();
# its result is used below through the columns xmin, xmax, ymin and ymax.
rect_data <- comp_semester_rects(
  n_interactions_w_llm_course_date_course_uni,
  col_date = "floor_date_month"
)

# Rows per month, drawn as a line on top of the grey bands.
n_interactions_w_llm_course_date_course_uni |>
  group_by(floor_date_month) |>
  summarise(n = n()) |>
  ggplot(aes(x = floor_date_month, y = n)) +
  geom_rect(
    data = rect_data,
    mapping = aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    inherit.aes = FALSE, # the rect data has no floor_date_month / n columns
    alpha = 0.2,
    fill = "grey"
  ) +
  geom_line() +
  labs(
    y = "Number of interactions with LLM",
    x = "Date"
  )
Show the code
# --- 1. Monthly counts ----
# Number of rows per month; ungrouped so the columns can be used directly.
plot_data <- n_interactions_w_llm_course_date_course_uni |>
  group_by(floor_date_month) |>
  summarise(n = n()) |>
  ungroup()

# --- 2. Extent of the plotted data ----
min_date <- min(plot_data$floor_date_month, na.rm = TRUE)
max_date <- max(plot_data$floor_date_month, na.rm = TRUE)
min_year <- year(min_date)
max_year <- year(max_date)

# Vertical extent of the rectangles: smallest to largest monthly count.
y_min <- min(plot_data$n, na.rm = TRUE)
y_max <- max(plot_data$n, na.rm = TRUE)

# --- 3. Rectangle coordinates (rect_data) ----
# One year of padding on both sides, so that a winter band starting in the
# year before the data begins is generated as well.
rect_years <- seq(min_year - 1, max_year + 1)

# Summer band: 1 March (Y) to 1 July (Y).
summer_rects <- tibble(year = rect_years) |>
  mutate(
    xmin = ymd(paste(year, "03", "01", sep = "-")),
    xmax = ymd(paste(year, "07", "01", sep = "-"))
  )

# Winter band: 1 October (Y) to 1 February (Y + 1).
winter_rects <- tibble(year = rect_years) |>
  mutate(
    xmin = ymd(paste(year, "10", "01", sep = "-")),
    xmax = ymd(paste(year + 1, "02", "01", sep = "-"))
  )

# Stack both sets, attach the y bounds and drop every band that does not
# overlap the plotted date range.
rect_data <- bind_rows(summer_rects, winter_rects) |>
  mutate(ymin = y_min, ymax = y_max) |>
  filter(
    xmin <= max_date,
    xmax >= min_date
  )

# --- 4. Final plot ----
ggplot(plot_data, aes(x = floor_date_month, y = n)) +
  # Bands first, so the line and points are drawn on top of them.
  geom_rect(
    data = rect_data,
    mapping = aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    inherit.aes = FALSE, # rect_data has no floor_date_month / n columns
    alpha = 0.2,
    fill = "grey"
  ) +
  geom_line() +
  geom_point() + # one marker per month
  theme_minimal() +
  labs(
    x = "Date",
    y = "Number of Interactions",
    title = "Number of Interactions with LLM per Course Date per University"
  )
8.7 Klick auf ein Wort im Transkript
Ausgewertet wird im Folgenden die Variable “click_transcript_word”.
# Number of visits (idvisit) per month that contain at least one
# "click_transcript_word" event.
click_transcript_word_per_month <- data_separated_filtered |>
  # Keep only visits in which some `value` mentions "click_transcript_word".
  # The previous `!any(...)` did the opposite and kept exactly the visits
  # WITHOUT such an event. na.rm = TRUE stops a missing `value` from turning
  # the whole group's test into NA.
  group_by(idvisit) |>
  filter(any(str_detect(value, "click_transcript_word"), na.rm = TRUE)) |>
  ungroup() |>
  # Rows whose `value` parses as a date-time supply the visit date; every
  # other row becomes NA and is dropped below.
  mutate(
    date_visit = ymd_hms(value),
    month_visit = floor_date(date_visit, unit = "month")
  ) |>
  drop_na(date_visit) |>
  # One row per visit: the first remaining timestamp row.
  group_by(idvisit) |>
  slice(1) |>
  ungroup() |>
  count(month_visit)

click_transcript_word_per_month
Show the code
# Background rectangles for this plot, derived from the plotted data itself.
rect_data_word_per_month <- comp_semester_rects(
  click_transcript_word_per_month,
  col_date = "month_visit"
)

# Visits with a transcript-word click per month, with a loess trend line.
# NOTE(review): month_visit is produced by floor_date() on ymd_hms() output;
# confirm it is a Date here, since scale_x_date() is applied below.
click_transcript_word_per_month |>
  ggplot(aes(x = month_visit, y = n)) +
  geom_rect(
    # Use the rectangles computed above, not a `rect_data` object left over
    # from an earlier chunk.
    data = rect_data_word_per_month,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = Inf),
    fill = "grey",
    alpha = 0.2,
    inherit.aes = FALSE # the rect data has no month_visit / n columns
  ) +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE, color = "blue", alpha = 0.7) +
  scale_x_date(labels = scales::label_date_short())
8.7.2.2 fingerprint unique
Show the code
# Number of distinct fingerprints per month that contain at least one
# "click_transcript_word" event.
click_transcript_word_per_month_fingerprint <- data_separated_filtered |>
  # Keep only fingerprints for which some `value` mentions
  # "click_transcript_word". The previous `!any(...)` did the opposite and
  # kept exactly the fingerprints WITHOUT such an event. na.rm = TRUE stops a
  # missing `value` from turning the whole group's test into NA.
  group_by(fingerprint) |>
  filter(any(str_detect(value, "click_transcript_word"), na.rm = TRUE)) |>
  ungroup() |>
  # Rows whose `value` parses as a date-time supply the date; every other
  # row becomes NA and is dropped below.
  mutate(
    date_visit = ymd_hms(value),
    month_visit = floor_date(date_visit, unit = "month")
  ) |>
  drop_na(date_visit) |>
  # One row per fingerprint: the first remaining timestamp row.
  group_by(fingerprint) |>
  slice(1) |>
  ungroup() |>
  count(month_visit)

click_transcript_word_per_month_fingerprint
Show the code
# Background rectangles computed from the data plotted here, instead of
# reusing a `rect_data` object left over from an unrelated earlier chunk.
rect_data_word_per_month_fingerprint <- comp_semester_rects(
  click_transcript_word_per_month_fingerprint,
  col_date = "month_visit"
)

# Fingerprints with a transcript-word click per month, with a loess trend.
click_transcript_word_per_month_fingerprint |>
  ggplot(aes(x = month_visit, y = n)) +
  geom_rect(
    data = rect_data_word_per_month_fingerprint,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = Inf),
    fill = "grey",
    alpha = 0.2,
    inherit.aes = FALSE # the rect data has no month_visit / n columns
  ) +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE, color = "blue", alpha = 0.7)
# How the pattern works:
#   1. Match the literal text `Category: "` (including the double quote).
#   2. Lazily consume any characters ...
#   3. ... up to, but not including, the literal text `', Action`
#      (a lookahead, so that text is not part of the match).
# str_extract() returns the whole match, so the "Category: " prefix stays in
# the extracted string.
regex_pattern <- "Category: \"(.*?)(?=', Action)"

# Frequency of each extracted category among rows that mention "transcript".
ai_actions_count <- data_long |>
  filter(str_detect(value, "transcript")) |>
  mutate(category = str_extract(value, regex_pattern)) |>
  select(category) |>
  # Remove the remaining single and double quote characters.
  mutate(category = str_replace_all(category, "[\"']", "")) |>
  count(category, sort = TRUE)

tt(ai_actions_count)
category
n
NA
217862
Category: clear_transcript_text_for_llm_context
104111
Category: click_transcript_word
8439
Category: select_transcript_text_for_llm_context
576
Category: click_button
43
Category: llm_response_de
3
Category: llm_response_en
3
8.8.3 KI-Klicks pro Monat
Im Objekt ai_transcript_clicks_per_month_count wird gezählt, wie oft der String "click_transcript_word" in den Daten (Langformat) gefunden wird; siehe Target ai_transcript_clicks_per_month in der Targets-Pipeline.
# Background rectangles from the project helper, keyed on the year_month
# column.
# NOTE(review): year_month is pasted with "-01" below, so it is presumably a
# "YYYY-MM" string; confirm comp_semester_rects() accepts it in that form.
rect_data <- comp_semester_rects(
  ai_transcript_clicks_per_month_count,
  col_date = "year_month"
)

# Monthly AI transcript clicks: line + points + loess trend over grey bands.
ai_transcript_clicks_per_month_count |>
  # First day of the month as a Date, used as the x position.
  mutate(date = ymd(paste0(year_month, "-01"))) |>
  ggplot(aes(x = date, y = n)) +
  geom_rect(
    data = rect_data,
    mapping = aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    inherit.aes = FALSE, # the rect data has no date / n columns
    alpha = 0.2,
    fill = "grey"
  ) +
  geom_line(group = 1) +
  geom_point() +
  theme_minimal() +
  labs(
    x = "date [months]",
    title = "Number of AI transcript clicks per month"
  ) +
  geom_smooth(method = "loess", se = FALSE, color = "blue", alpha = 0.2)
Show the code
# --- 1. Prepare data ----
# Monthly click counts with a proper Date column (first day of each month).
ai_clicks_data <- ai_transcript_clicks_per_month_count |>
  mutate(date = ymd(paste0(year_month, "-01")))

# --- 2. Dates of the vertical marker lines ----
# Years covered by the data.
min_year <- min(year(ai_clicks_data$date), na.rm = TRUE)
max_year <- max(year(ai_clicks_data$date), na.rm = TRUE)
years <- seq(min_year, max_year)

# First day of February, March, July and October in every covered year,
# restricted to the observed date range. The range is computed with
# na.rm = TRUE (as for the years above) so that a single unparseable
# year_month cannot turn the comparison into NA and drop every line.
vline_dates <- expand.grid(year = years, month = c(2, 3, 7, 10)) |>
  mutate(
    date_str = paste0(year, "-", month, "-01"),
    vline_date = ymd(date_str)
  ) |>
  filter(
    vline_date >= min(ai_clicks_data$date, na.rm = TRUE),
    vline_date <= max(ai_clicks_data$date, na.rm = TRUE)
  ) |>
  pull(vline_date)

# --- 3. Plot ----
ai_clicks_data |>
  ggplot(aes(x = date, y = n)) +
  # Marker lines first, so the data is drawn on top of them.
  geom_vline(
    xintercept = vline_dates,
    color = "darkred",
    linetype = "dashed",
    alpha = 0.6
  ) +
  geom_line(group = 1) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Number of AI transcript clicks per month",
    x = "date [months]"
  ) +
  geom_smooth(method = "loess", se = FALSE, color = "blue", alpha = 0.5)
Show the code
# --- 1. Prepare data ----
# Monthly click counts with a proper Date column (first day of each month).
ai_clicks_data <- ai_transcript_clicks_per_month_count |>
  mutate(date = ymd(paste0(year_month, "-01")))

# Years covered by the data.
min_year <- min(year(ai_clicks_data$date), na.rm = TRUE)
max_year <- max(year(ai_clicks_data$date), na.rm = TRUE)
years <- seq(min_year, max_year)

# Vertical extent of the rectangles: smallest to largest monthly count.
y_min <- min(ai_clicks_data$n, na.rm = TRUE)
y_max <- max(ai_clicks_data$n, na.rm = TRUE)

# --- 2. Rectangle coordinates (rect_data) ----
# Period 1: 1 March (Y) to 1 July (Y).
rect_data_mar_jul <- tibble(year = years) |>
  mutate(
    xmin = ymd(paste0(year, "-03-01")),
    xmax = ymd(paste0(year, "-07-01"))
  )

# Period 2: 1 October (Y) to 1 February (Y + 1).
# The sequence starts one year before the data so that the band running from
# October of the previous year into the first plotted January is generated
# too; bands outside the data range are removed by the filter below.
rect_data_oct_feb <- tibble(year = seq(min_year - 1, max_year)) |>
  mutate(
    xmin = ymd(paste0(year, "-10-01")),
    xmax = ymd(paste0(year + 1, "-02-01"))
  )

# Combine, attach the y bounds and keep only bands that overlap the plotted
# date range at least partially.
rect_data <- bind_rows(rect_data_mar_jul, rect_data_oct_feb) |>
  mutate(ymin = y_min, ymax = y_max) |>
  filter(
    xmin <= max(ai_clicks_data$date, na.rm = TRUE),
    xmax >= min(ai_clicks_data$date, na.rm = TRUE)
  )

# --- 3. Plot ----
ai_clicks_data |>
  ggplot(aes(x = date, y = n)) +
  # Bands first, so the line and points are drawn on top of them.
  geom_rect(
    data = rect_data,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    fill = "grey",
    alpha = 0.2,
    inherit.aes = FALSE # rect_data has no date / n columns
  ) +
  geom_line(group = 1) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Number of AI transcript clicks per month",
    x = "date [months]"
  ) +
  geom_smooth(method = "loess", se = FALSE, color = "blue", alpha = 0.5)
8.9 Output des LLMs: llm_response - Tokens und Tokenlänge
# Horizontal boxplots of tokens_n per university, with the "mean_se"
# summary overlaid via ggpubr's `add` argument.
ggboxplot(
  llm_response_text_date_course_uni,
  x = "university",
  y = "tokens_n",
  add = "mean_se"
) +
  theme_minimal() +
  labs(
    x = "University",
    y = "Number of tokens",
    title = "Number of tokens in LLM responses by university"
  ) +
  coord_flip()
# Horizontal boxplots of tokens_n per course, with the "mean_se" summary
# overlaid via ggpubr's `add` argument.
ggboxplot(
  llm_response_text_date_course_uni,
  x = "course",
  y = "tokens_n",
  add = "mean_se"
) +
  theme_minimal() +
  labs(
    x = "Course",
    y = "Number of tokens",
    title = "Number of tokens in LLM responses by course"
  ) +
  coord_flip()
8.9.5 Anzahl vorab existierender Fragen
8.9.5.1 Anzahl verify_option_wrong und verify_option_correct
8.9.5.1.1 idvisit
Show the code
# Per visit, count the rows whose `value` is exactly "verify_option_wrong" or
# "verify_option_correct". The work is done in data.table and the result is
# handed back as a tibble.
verify_option_summary <- as_tibble(
  as.data.table(data_separated_filtered)[
    ,
    list(idvisit, value) # only the two columns needed
  ][
    value %chin% c("verify_option_wrong", "verify_option_correct"),
    list(verify_option = .N), # number of matching rows ...
    by = idvisit              # ... per visit
  ]
)
# Convert both summaries to data.table by reference.
setDT(generate_questionaire_summary)
setDT(verify_option_summary)

# Full join on idvisit: rows from either table are kept.
prior_existing_questions_summary <- merge(
  generate_questionaire_summary,
  verify_option_summary,
  by = "idvisit",
  all.x = TRUE,
  all.y = TRUE
)

# Add the difference of the two counts as a new column, by reference.
# NOTE(review): after the full join either count can be NA, which makes the
# difference NA as well; confirm that is intended.
prior_existing_questions_summary[
  ,
  `:=`(prior_existing_questions_n, verify_option - generate_questionaire)
]

# dplyr equivalent, kept for reference:
# prior_existing_questions_summary <-
#   generate_questionaire_summary |>
#   full_join(verify_option_summary) |>
#   mutate(prior_existing_questions_n = verify_option - generate_questionaire)
# Histogram of token_length in prompt_length, 10-token-wide bins.
ggplot(
  data = prompt_length,
  mapping = aes(x = token_length)
) +
  geom_histogram(binwidth = 10) +
  theme_minimal() +
  labs(
    x = "Prompt length (in tokens)",
    y = "Number of prompts",
    title = "Length of prompts sent to the LLM"
  )