library(targets)
library(tidyverse)
library(tarchetypes) # für tar_files()
targets-multiple-data-files
Aufgabe
Schreiben Sie eine targets
Pipeline, die einen Ordner mit Datendateien beobachtet und sich aktualisiert, wenn neue Daten dazukommen. Die Pipeline soll die Datendateien importieren und zu einer Tabelle zusammenfügen und schließlich die Zeilen zählen.
Lösung
Die folgende Lösung ist stark inspiriert von diesem SO-Post.
Wir schreiben eine `_targets.R`-Datei mit folgendem Inhalt.
Zuerst das Setup:
Dann definieren wir Konstanten; hier den Pfad:
# Constants: keep every project path in a single list so that further
# locations can be added later in one place.
path <- list()
path$data <- "data/"
Aus Gründen der Ordnungsfreude haben wir eine Liste erstellt, in der dann alle möglichen Pfade abgelegt werden können.
Schließlich definieren wir die Pipeline. Hier spielt die Musik:
# Pipeline definition: the list of targets.
list(
# NOTE(review): `pattern` in list.files() is a regular expression, so "csv"
# matches any file name that merely contains "csv"; "\\.csv$" would be
# stricter -- confirm before changing.
tar_files(data_paths, path$data %>% list.files(full.names = TRUE, pattern = "csv")), # list of the data files
tar_target(data_proc, data_paths %>% read_csv(), # import
pattern = map(data_paths)), # iterate over all elements of data_paths, i.e. over all data files
tar_target(n_row, nrow(data_proc)) # count rows
)
[[1]]
[[1]]$data_paths_files
<tar_stem>
name: data_paths_files
description:
command:
path$data %>% list.files(full.names = TRUE, pattern = "csv")
format: rds
repository: local
iteration method: vector
error mode: stop
memory mode: auto
storage mode: worker
retrieval mode: auto
deployment mode: worker
priority: 0
resources:
list()
cue:
seed: TRUE
file: TRUE
iteration: TRUE
repository: TRUE
format: TRUE
depend: TRUE
command: TRUE
mode: always
packages:
tarchetypes
lubridate
forcats
stringr
dplyr
purrr
readr
tidyr
tibble
ggplot2
tidyverse
targets
stats
graphics
grDevices
utils
datasets
colorout
methods
base
library:
NULL
[[1]]$data_paths
<tar_pattern>
name: data_paths
description:
command:
data_paths_files
pattern:
map(data_paths_files)
format: file
repository: local
iteration method: vector
error mode: stop
memory mode: auto
storage mode: main
retrieval mode: main
deployment mode: main
priority: 0
resources:
list()
cue:
seed: TRUE
file: TRUE
iteration: TRUE
repository: TRUE
format: TRUE
depend: TRUE
command: TRUE
mode: thorough
packages:
character(0)
library:
NULL
[[2]]
<tar_pattern>
name: data_proc
description:
command:
data_paths %>% read_csv()
pattern:
map(data_paths)
format: rds
repository: local
iteration method: vector
error mode: stop
memory mode: auto
storage mode: worker
retrieval mode: auto
deployment mode: worker
priority: 0
resources:
list()
cue:
seed: TRUE
file: TRUE
iteration: TRUE
repository: TRUE
format: TRUE
depend: TRUE
command: TRUE
mode: thorough
packages:
tarchetypes
lubridate
forcats
stringr
dplyr
purrr
readr
tidyr
tibble
ggplot2
tidyverse
targets
stats
graphics
grDevices
utils
datasets
colorout
methods
base
library:
NULL
[[3]]
<tar_stem>
name: n_row
description:
command:
nrow(data_proc)
format: rds
repository: local
iteration method: vector
error mode: stop
memory mode: auto
storage mode: worker
retrieval mode: auto
deployment mode: worker
priority: 0
resources:
list()
cue:
seed: TRUE
file: TRUE
iteration: TRUE
repository: TRUE
format: TRUE
depend: TRUE
command: TRUE
mode: thorough
packages:
tarchetypes
lubridate
forcats
stringr
dplyr
purrr
readr
tidyr
tibble
ggplot2
tidyverse
targets
stats
graphics
grDevices
utils
datasets
colorout
methods
base
library:
NULL
Mit `pattern = map(data_paths)` iterieren wir nicht nur über alle Elemente von `data_paths`, sondern fügen die Elemente auch zu einer Tabelle zusammen.
Hier ist die ganze Syntax noch einmal:
# _targets.R file
library(targets)
library(tidyverse)
library(tarchetypes)
# Constants: keep every project path in a single list so that further
# locations can be added later in one place.
path <- list()
path$data <- "data/"
# Pipeline definition: the list of targets.
list(
# List the data files in path$data and branch over them.
# NOTE(review): `pattern` in list.files() is a regular expression, so "csv"
# matches any file name that merely contains "csv"; "\\.csv$" would be
# stricter -- confirm before changing.
tar_files(data_paths, path$data %>% list.files(full.names = TRUE, pattern = "csv")),
# Import the files: one read_csv() call per element of data_paths.
tar_target(data_proc, data_paths %>% read_csv(),
pattern = map(data_paths)),
# Count the rows of data_proc.
tar_target(n_row, nrow(data_proc))
)
[[1]]
[[1]]$data_paths_files
<tar_stem>
name: data_paths_files
description:
command:
path$data %>% list.files(full.names = TRUE, pattern = "csv")
format: rds
repository: local
iteration method: vector
error mode: stop
memory mode: auto
storage mode: worker
retrieval mode: auto
deployment mode: worker
priority: 0
resources:
list()
cue:
seed: TRUE
file: TRUE
iteration: TRUE
repository: TRUE
format: TRUE
depend: TRUE
command: TRUE
mode: always
packages:
tarchetypes
lubridate
forcats
stringr
dplyr
purrr
readr
tidyr
tibble
ggplot2
tidyverse
targets
stats
graphics
grDevices
utils
datasets
colorout
methods
base
library:
NULL
[[1]]$data_paths
<tar_pattern>
name: data_paths
description:
command:
data_paths_files
pattern:
map(data_paths_files)
format: file
repository: local
iteration method: vector
error mode: stop
memory mode: auto
storage mode: main
retrieval mode: main
deployment mode: main
priority: 0
resources:
list()
cue:
seed: TRUE
file: TRUE
iteration: TRUE
repository: TRUE
format: TRUE
depend: TRUE
command: TRUE
mode: thorough
packages:
character(0)
library:
NULL
[[2]]
<tar_pattern>
name: data_proc
description:
command:
data_paths %>% read_csv()
pattern:
map(data_paths)
format: rds
repository: local
iteration method: vector
error mode: stop
memory mode: auto
storage mode: worker
retrieval mode: auto
deployment mode: worker
priority: 0
resources:
list()
cue:
seed: TRUE
file: TRUE
iteration: TRUE
repository: TRUE
format: TRUE
depend: TRUE
command: TRUE
mode: thorough
packages:
tarchetypes
lubridate
forcats
stringr
dplyr
purrr
readr
tidyr
tibble
ggplot2
tidyverse
targets
stats
graphics
grDevices
utils
datasets
colorout
methods
base
library:
NULL
[[3]]
<tar_stem>
name: n_row
description:
command:
nrow(data_proc)
format: rds
repository: local
iteration method: vector
error mode: stop
memory mode: auto
storage mode: worker
retrieval mode: auto
deployment mode: worker
priority: 0
resources:
list()
cue:
seed: TRUE
file: TRUE
iteration: TRUE
repository: TRUE
format: TRUE
depend: TRUE
command: TRUE
mode: thorough
packages:
tarchetypes
lubridate
forcats
stringr
dplyr
purrr
readr
tidyr
tibble
ggplot2
tidyverse
targets
stats
graphics
grDevices
utils
datasets
colorout
methods
base
library:
NULL
Categories:
- projectmgt
- targets
- repro
- string