Skip to content

Commit ad37387

Browse files
authored
🔖 Version 0.5.0 (#80)
2 parents 2e58657 + a954bae commit ad37387

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+682
-249
lines changed

.github/workflows/pr_title.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
name: 'PR Title Checker'
2+
on:
3+
pull_request:
4+
types: [edited, opened, synchronize, reopened]
5+
branches:
6+
- main
7+
- master
8+
9+
jobs:
10+
title-check:
11+
runs-on: ubuntu-latest
12+
steps:
13+
- uses: naveenk1223/action-pr-title@master
14+
with:
15+
regex: 'Version [0-9]+\.[0-9]+(\.[0-9]+)?$' # Regex the title should match.
16+
allowed_prefixes: ':bookmark:' # title should start with the given prefix
17+
prefix_case_sensitive: true # match the title prefix case-sensitively
18+
min_length: 11 # Min length of the title
19+
max_length: -1 # Max length of the title
20+
name: Check PR title

.zenodo.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"title": "git2rdata: Store and Retrieve Data.frames in a Git Repository",
3-
"version": "0.4.1",
3+
"version": "0.5.0",
44
"license": "GPL-3.0",
55
"upload_type": "software",
66
"description": "<p>The git2rdata package is an R package for writing and reading dataframes as plain text files. A metadata file stores important information. 1) Storing metadata allows to maintain the classes of variables. By default, git2rdata optimizes the data for file storage. The optimization is most effective on data containing factors. The optimization makes the data less human readable. The user can turn this off when they prefer a human readable format over smaller files. Details on the implementation are available in vignette(“plain_text”, package = “git2rdata”). 2) Storing metadata also allows smaller row based diffs between two consecutive commits. This is a useful feature when storing data as plain text files under version control. Details on this part of the implementation are available in vignette(“version_control”, package = “git2rdata”). Although we envisioned git2rdata with a git workflow in mind, you can use it in combination with other version control systems like subversion or mercurial. 3) git2rdata is a useful tool in a reproducible and traceable workflow. vignette(“workflow”, package = “git2rdata”) gives a toy example. 4) vignette(“efficiency”, package = “git2rdata”) provides some insight into the efficiency of file storage, git repository size and speed for writing and reading.<\/p>",

CITATION.cff

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,4 @@ identifiers:
4040
value: 10.5281/zenodo.1485309
4141
- type: url
4242
value: https://ropensci.github.io/git2rdata/
43-
version: 0.4.1
43+
version: 0.5.0

DESCRIPTION

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: git2rdata
22
Title: Store and Retrieve Data.frames in a Git Repository
3-
Version: 0.4.1
3+
Version: 0.5.0
44
Authors@R: c(
55
person("Thierry", "Onkelinx", , "[email protected]", role = c("aut", "cre"),
66
comment = c(ORCID = "0000-0001-8804-4216", affiliation = "Research Institute for Nature and Forest (INBO)")),
@@ -38,14 +38,15 @@ URL: https://ropensci.github.io/git2rdata/,
3838
https://doi.org/10.5281/zenodo.1485309
3939
BugReports: https://github.com/ropensci/git2rdata/issues
4040
Depends:
41-
R (>= 3.5.0)
41+
R (>= 4.1.0)
4242
Imports:
4343
assertthat,
4444
git2r (>= 0.23.0),
4545
methods,
4646
yaml
4747
Suggests:
4848
ggplot2,
49+
jsonlite,
4950
knitr,
5051
microbenchmark,
5152
rmarkdown,
@@ -60,6 +61,7 @@ Roxygen: list(markdown = TRUE)
6061
RoxygenNote: 7.3.2
6162
Collate:
6263
'clean_data_path.R'
64+
'data_package.R'
6365
'datahash.R'
6466
'display_metadata.R'
6567
'git2rdata_package.R'

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ S3method(write_vc,character)
5151
S3method(write_vc,default)
5252
S3method(write_vc,git_repository)
5353
export(commit)
54+
export(data_package)
5455
export(display_metadata)
5556
export(is_git2rdata)
5657
export(is_git2rmeta)
@@ -74,6 +75,7 @@ importFrom(assertthat,"on_failure<-")
7475
importFrom(assertthat,assert_that)
7576
importFrom(assertthat,has_attr)
7677
importFrom(assertthat,has_name)
78+
importFrom(assertthat,is.count)
7779
importFrom(assertthat,is.flag)
7880
importFrom(assertthat,is.string)
7981
importFrom(assertthat,noNA)

NEWS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
# git2rdata 0.5.0
2+
3+
* `read_vc()` handles empty datasets stored with `split_by`.
4+
* `write_vc()` and `meta()` gain a `digits` argument.
5+
The argument specifies the number of significant digits to store for numeric
6+
values.
7+
18
# git2rdata 0.4.1
29

310
* Add `update_metadata()` to update the description of a `git2rdata` object.

R/data_package.R

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
#' Create a Data Package for a directory of CSV files
#'
#' @description
#' Create a `datapackage.json` file for a directory of CSV files.
#' The function will look for all `.csv` files in the directory and its
#' subdirectories.
#' It keeps the files for which `is_git2rdata()` returns `TRUE` and creates a
#' `datapackage.json` file with the metadata of each of those CSV files.
#'
#' @param path the directory in which to create the `datapackage.json` file.
#' @return The path of the `datapackage.json` file, as a string.
#' The function stops with an error when `path` is not a directory or when
#' none of the `.csv` files is a `git2rdata` object.
#' @family storage
#' @export
#' @importFrom assertthat assert_that is.string noNA
data_package <- function(path = ".") {
  assert_that(
    is.string(path), noNA(path), requireNamespace("jsonlite", quietly = TRUE)
  )
  stopifnot("`path` is not a directory" = file_test("-d", path))

  # escape the dot: an unescaped `.` matches any character, so names such as
  # `foocsv` would be picked up as well
  data_files <- list.files(path, pattern = "\\.csv$", recursive = TRUE)
  relevant <- vapply(
    data_files, FUN = is_git2rdata, FUN.VALUE = logical(1), root = path
  )
  stopifnot(
    "no non-optimized git2rdata objects found at `path`" = any(relevant)
  )
  data_files <- data_files[relevant]

  # data_resource() wraps every resource in a list of length one, which lets
  # vapply() collect them into a plain list
  resources <- vapply(
    data_files, path = path, FUN = data_resource,
    FUN.VALUE = vector(mode = "list", length = 1)
  )
  package_file <- file.path(path, "datapackage.json")
  list(resources = unname(resources)) |>
    jsonlite::toJSON(pretty = TRUE, auto_unbox = TRUE) |>
    writeLines(package_file)
  return(package_file)
}
40+
41+
#' Describe one CSV file as a resource of a Data Package
#'
#' Reads the YAML file found at the second element of
#' `clean_data_path(root = path, file = file)` and converts its content into
#' the list structure that `data_package()` serialises to JSON.
#'
#' @param file the data file, passed on to `clean_data_path()`.
#' @param path the root directory, passed on to `clean_data_path()`.
#' @return A list of length one holding the resource description.
#' The optional `title` and `description` of the `..generic` metadata come
#' first when present, followed by `name`, `path`, `encoding`, `format`,
#' `media_type`, `hash` and `schema`.
#' @noRd
#' @importFrom assertthat assert_that is.string noNA
#' @importFrom yaml read_yaml
data_resource <- function(file, path = ".") {
  assert_that(
    is.string(file), is.string(path), noNA(file), noNA(path)
  )
  stopifnot("`path` is not a directory" = file_test("-d", path))

  metadata <- clean_data_path(root = path, file = file)[2] |>
    read_yaml()
  generic <- metadata[["..generic"]]
  # every entry except `..generic` describes a column; select by name instead
  # of relying on `..generic` being the first entry
  columns <- setdiff(names(metadata), "..generic")
  fields <- vapply(
    columns, metadata = metadata, FUN = field_schema,
    FUN.VALUE = vector(mode = "list", length = 1)
  )
  dr <- list(
    name = coalesce(generic[["name"]], file), path = file,
    # NOTE(review): the Data Package specification seems to spell the media
    # type property `mediatype`; confirm that `media_type` is intended
    "encoding" = "utf-8", format = "csv", media_type = "text/csv",
    hash = paste0("sha1:", generic[["data_hash"]]),
    schema = list(
      fields = unname(fields),
      missingValues = list(
        # a list, not a named character vector: jsonlite drops the names of
        # atomic vectors, which would turn this object into a bare array
        list(value = generic[["NA string"]], label = "missing")
      )
    )
  )
  extra <- c("title", "description")
  list(c(generic[extra[extra %in% names(generic)]], dr))
}
71+
72+
#' Convert the metadata of one column into a field description
#'
#' The class stored in `metadata[[x]]` selects the field type and its
#' type-specific properties.
#' A `description` stored for the column is appended when present.
#'
#' @param x the name of the column.
#' @param metadata the metadata list containing an entry named `x`.
#' @return A list of length one holding the field description.
#' Stops with an error for a class without a matching field type.
#' @noRd
field_schema <- function(x, metadata) {
  column <- metadata[[x]]
  # properties that depend on the class; the name is prepended afterwards
  properties <- switch(
    column$class,
    "character" = list(type = "string"),
    "Date" = list(type = "date"),
    "logical" = list(
      type = "boolean", trueValues = c("TRUE", "true"),
      falseValues = c("FALSE", "false")
    ),
    "factor" = list(
      type = "string", categories = column[["labels"]],
      categoriesOrdered = column[["ordered"]]
    ),
    "integer" = list(type = "integer"),
    "numeric" = list(type = "number"),
    "POSIXct" = list(type = "datetime", format = "%Y-%m-%dT%H:%M:%SZ"),
    stop("field_schema() can't handle ", column$class)
  )
  fs <- c(list(name = x), properties)
  if (is.element("description", names(column))) {
    fs$description <- column[["description"]]
  }
  # wrapped in a list so callers can collect the result with vapply()
  list(fs)
}
97+
98+
#' Return the first argument that is not `NULL`
#'
#' @param ... candidate values, checked in the order given.
#' @return The first non-`NULL` argument, or `NULL` when there are no
#' arguments or all of them are `NULL`.
#' @noRd
coalesce <- function(...) {
  # Find() yields NULL when no element satisfies the predicate, which covers
  # both the empty call and the all-NULL case
  Find(function(candidate) !is.null(candidate), list(...))
}

R/meta.R

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
#' @examples
1515
#' meta(c(NA, "'NA'", '"NA"', "abc\tdef", "abc\ndef"))
1616
#' meta(1:3)
17-
#' meta(seq(1, 3, length = 4))
17+
#' meta(seq(1, 3, length = 4), digits = 6)
1818
#' meta(factor(c("b", NA, "NA"), levels = c("NA", "b", "c")))
1919
#' meta(factor(c("b", NA, "a"), levels = c("a", "b", "c")), optimize = FALSE)
2020
#' meta(factor(c("b", NA, "a"), levels = c("a", "b", "c"), ordered = TRUE))
@@ -29,7 +29,7 @@
2929
#' meta(as.POSIXct("2019-02-01 10:59:59", tz = "CET"), optimize = FALSE)
3030
#' meta(as.Date("2019-02-01"))
3131
#' meta(as.Date("2019-02-01"), optimize = FALSE)
32-
meta <- function(x, ...) {
32+
meta <- function(x, ..., digits) {
3333
UseMethod("meta", x)
3434
}
3535

@@ -63,8 +63,11 @@ meta.integer <- function(x, ...) {
6363
}
6464

6565
#' @export
66-
meta.numeric <- function(x, ...) {
67-
list(class = "numeric") -> m
66+
#' @importFrom assertthat assert_that is.count
67+
meta.numeric <- function(x, ..., digits) {
68+
stopifnot("`digits` must be a strict positive integer" = is.count(digits))
69+
x <- signif(x, digits = digits)
70+
list(class = "numeric", digits = as.integer(digits)) -> m
6871
class(m) <- "meta_detail"
6972
attr(x, "meta") <- m
7073
return(x)
@@ -218,7 +221,7 @@ meta.Date <- function(x, optimize = TRUE, ...) {
218221
#' @inheritParams write_vc
219222
meta.data.frame <- function(# nolint
220223
x, optimize = TRUE, na = "NA", sorting, strict = TRUE,
221-
split_by = character(0), ...
224+
split_by = character(0), ..., digits
222225
) {
223226
assert_that(
224227
!has_name(x, "..generic"),
@@ -237,13 +240,46 @@ meta.data.frame <- function(# nolint
237240
)
238241

239242
dots <- list(...)
243+
float <- vapply(x, is.numeric, logical(1)) &
244+
!vapply(x, is.integer, logical(1))
240245
if (has_name(dots, "old")) {
241246
old <- dots$old
242247
assert_that(inherits(old, "meta_list"))
243248
if (missing(sorting)) {
244249
sorting <- old[["..generic"]][["sorting"]]
245250
}
251+
if (any(float) && missing(digits)) {
252+
old_numeric <- vapply(
253+
old, FUN.VALUE = logical(1),
254+
FUN = function(x) {
255+
has_name(x, "class") && x$class == "numeric" && has_name(x, "digits")
256+
}
257+
)
258+
digits <- vapply(
259+
old[old_numeric], FUN.VALUE = numeric(1),
260+
FUN = function(x) {
261+
x[["digits"]]
262+
}
263+
)
264+
relevant <- names(float)[float][!names(float)[float] %in% names(digits)]
265+
rep(6L, length(relevant)) -> digits[relevant]
266+
}
267+
}
268+
if (any(float) && missing(digits)) {
269+
digits <- 6L
270+
warning("`digits` was not set. Setting is automatically to 6. See ?meta")
246271
}
272+
if (any(float) && is.null(names(digits))) {
273+
stopifnot(
274+
"`digits` must be either named or have length 1" = length(digits) == 1
275+
)
276+
digits <- rep(digits, sum(float))
277+
names(digits) <- names(float)[float]
278+
}
279+
stopifnot(
280+
"`digits` must contain all numeric variables of `x`" =
281+
all(!float) || all(names(float)[float] %in% names(digits))
282+
)
247283

248284
# apply sorting
249285
if (missing(sorting) || is.null(sorting) || !length(sorting)) {
@@ -271,12 +307,13 @@ Add extra sorting variables to ensure small diffs.", sorted)
271307
if (length(split_by) > 0) {
272308
generic <- c(generic, split_by = list(split_by))
273309
}
310+
274311
# calculate meta for each column
275312
if (!has_name(dots, "old")) {
276313
z <- lapply(
277314
colnames(x),
278315
function(id, optimize, na) {
279-
meta(x[[id]], optimize = optimize, na = na)
316+
meta(x[[id]], optimize = optimize, na = na, digits = digits[[id]])
280317
},
281318
optimize = optimize, na = na
282319
)
@@ -290,7 +327,7 @@ Add extra sorting variables to ensure small diffs.", sorted)
290327
meta(
291328
x[[id]], optimize = optimize, na = na,
292329
index = setNames(old[[id]][["index"]], old[[id]][["labels"]]),
293-
strict = strict
330+
strict = strict, digits = digits[[id]]
294331
)
295332
},
296333
optimize = old[["..generic"]][["optimize"]],
@@ -305,7 +342,7 @@ Add extra sorting variables to ensure small diffs.", sorted)
305342
z_new <- lapply(
306343
new,
307344
function(id, optimize, na) {
308-
meta(x[[id]], optimize = optimize, na = na)
345+
meta(x[[id]], optimize = optimize, na = na, digits = digits[[id]])
309346
},
310347
optimize = optimize, na = na
311348
)

R/read_vc.R

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -83,29 +83,38 @@ read_vc.character <- function(file, root = ".") {
8383
comment.char = "",
8484
stringsAsFactors = FALSE, fileEncoding = "UTF-8"
8585
)
86-
raw_data <- vapply(
87-
seq_len(nrow(index)),
88-
function(i) {
89-
rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv"))
90-
raw_data <- read.table(
91-
file = rf, header = TRUE, sep = "\t", quote = "\"",
92-
dec = ".", numerals = "warn.loss", na.strings = na_string,
93-
colClasses = setNames(
94-
col_type[col_classes[!which_split_by]],
95-
col_names[!which_split_by]
96-
),
97-
comment.char = "",
98-
stringsAsFactors = FALSE, fileEncoding = "UTF-8"
99-
)
100-
raw_data <- cbind(
101-
index[rep(i, nrow(raw_data)), split_by, drop = FALSE],
102-
raw_data
103-
)
104-
return(list(raw_data))
105-
},
106-
vector(mode = "list", length = 1)
107-
)
108-
raw_data <- do.call(rbind, raw_data)[, col_names]
86+
if (nrow(index) == 0) {
87+
list(
88+
character = character(0), factor = character(0), integer = integer(0),
89+
numeric = numeric(0)
90+
)[col_classes] |>
91+
setNames(col_names) |>
92+
as.data.frame() -> raw_data
93+
} else {
94+
raw_data <- vapply(
95+
seq_len(nrow(index)),
96+
function(i) {
97+
rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv"))
98+
raw_data <- read.table(
99+
file = rf, header = TRUE, sep = "\t", quote = "\"",
100+
dec = ".", numerals = "warn.loss", na.strings = na_string,
101+
colClasses = setNames(
102+
col_type[col_classes[!which_split_by]],
103+
col_names[!which_split_by]
104+
),
105+
comment.char = "",
106+
stringsAsFactors = FALSE, fileEncoding = "UTF-8"
107+
)
108+
raw_data <- cbind(
109+
index[rep(i, nrow(raw_data)), split_by, drop = FALSE],
110+
raw_data
111+
)
112+
return(list(raw_data))
113+
},
114+
vector(mode = "list", length = 1)
115+
)
116+
raw_data <- do.call(rbind, raw_data)[, col_names]
117+
}
109118
} else {
110119
raw_data <- read.table(
111120
file = file["raw_file"], header = TRUE, sep = ifelse(optimize, "\t", ","),

0 commit comments

Comments
 (0)