Skip to content

Commit 1579c40

Browse files
committed
Scrape metadata from Geoportal Berlin
links to services in nested table
1 parent 8c8604e commit 1579c40

12 files changed

Lines changed: 446 additions & 2 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
.RData
44
.Ruserdata
55
docs
6+
inst/doc

DESCRIPTION

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,13 @@ BugReports: https://github.com/KWB-R/kwb.geoportal/issues
1414
Encoding: UTF-8
1515
Roxygen: list(markdown = TRUE)
1616
RoxygenNote: 7.3.2
17+
Imports:
18+
purrr,
19+
tibble,
20+
xml2
1721
Suggests:
18-
covr
22+
covr,
23+
DT,
24+
knitr,
25+
rmarkdown
26+
VignetteBuilder: knitr

NAMESPACE

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,15 @@
11
# Generated by roxygen2: do not edit by hand
22

3+
export("%||%")
4+
export(parse_gn_link)
5+
export(read_metadata)
6+
export(read_metadata_all)
7+
importFrom(dplyr,bind_rows)
8+
importFrom(purrr,map_dfr)
9+
importFrom(stats,setNames)
10+
importFrom(tibble,tibble)
11+
importFrom(xml2,read_xml)
12+
importFrom(xml2,xml_attr)
13+
importFrom(xml2,xml_find_all)
14+
importFrom(xml2,xml_find_first)
15+
importFrom(xml2,xml_text)

R/read_metadata.R

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
#' Null-coalescing helper
2+
#'
3+
#' Small utility to replace `NULL` or zero-length objects by a default value.
4+
#' This is handy when parsing GeoNetwork JSON/XML responses where some
5+
#' elements are simply missing.
6+
#'
7+
#' @param x Any object that might be `NULL` or of length 0.
8+
#' @param y Fallback value to be returned when `x` is `NULL` or length 0.
9+
#'
10+
#' @return Either `x` (when present) or `y` (when `x` is missing).
11+
#'
12+
#' @keywords internal
13+
#' @export
14+
`%||%` <- function(x, y) if (is.null(x) || length(x) == 0) y else x
15+
16+
17+
#' Parse a single GeoNetwork link element
18+
#'
19+
#' GeoNetwork often encodes links as a single string separated by `|`, e.g.:
20+
#' `|Darstellungsdienst (WMS)|https://...|OGC:WMS|||`.
21+
#' This helper splits such a string into named columns and pads missing parts
22+
#' up to 6 elements.
23+
#'
24+
#' The order used here is:
25+
#' 1. link name
26+
#' 2. link description
27+
#' 3. link URL
28+
#' 4. link protocol (e.g. `"OGC:WMS"`)
29+
#' 5. MIME type
30+
#' 6. order
31+
#'
32+
#' Note: In many Berlin GDI records the **first** field (link name) is empty,
33+
#' and the actual meaningful text is in the **second** field (description).
34+
#'
35+
#' @param x Character string as found inside a `<link>` XML node.
36+
#'
37+
#' @return A one-row tibble with columns:
38+
#' \itemize{
39+
#' \item `link_name`
40+
#' \item `link_desc`
41+
#' \item `link_url`
42+
#' \item `link_protocol`
43+
#' \item `link_mime`
44+
#' \item `link_order`
45+
#' }
46+
#'
47+
#' @examples
48+
#' parse_gn_link("|Darstellungsdienst (WMS)|https://example.org/wms?|OGC:WMS|||")
49+
#'
50+
#' @importFrom tibble tibble
51+
#' @export
52+
parse_gn_link <- function(x) {
53+
if (is.null(x) || length(x) == 0) {
54+
x <- ""
55+
}
56+
parts <- strsplit(x, "\\|")[[1]]
57+
# pad / trim to 6 elements
58+
if (length(parts) < 6) {
59+
parts <- c(parts, rep(NA_character_, 6 - length(parts)))
60+
} else if (length(parts) > 6) {
61+
parts <- parts[1:6]
62+
}
63+
tibble::tibble(
64+
link_name = parts[1],
65+
link_desc = parts[2],
66+
link_url = parts[3],
67+
link_protocol = parts[4],
68+
link_mime = parts[5],
69+
link_order = parts[6]
70+
)
71+
}
72+
73+
74+
#' Read GeoNetwork service metadata (one row per service)
75+
#'
76+
#' This function reads a GeoNetwork XML search response (as delivered by
77+
#' `https://gdi.berlin.de/geonetwork/srv/ger/q?...`) and converts it into a
78+
#' tidy tibble with **one row per metadata record**.
79+
#' All `<link>` elements of a record are kept together in a **list column**
80+
#' called `links`, where each entry is itself a tibble created by
81+
#' [parse_gn_link()].
82+
#'
83+
#' This structure is convenient when you want to keep the dataset-level
84+
#' information (title, abstract, uuid, ...) together, but still be able to
85+
#' inspect or unnest all service/download/view links later on.
86+
#'
87+
#' @param path_xml Path or URL to the GeoNetwork XML document. This can be a
88+
#' local file (e.g. `"geoportal_metadaten.xml"`) or a remote URL such as
89+
#' `"https://gdi.berlin.de/geonetwork/srv/ger/q?..."`.
90+
#'
91+
#' @return A tibble with one row per metadata record and the columns:
92+
#' \describe{
93+
#' \item{geonet_uuid}{UUID from `<geonet:info><uuid>` (character).}
94+
#' \item{geonet_id}{Internal GeoNetwork id from `<geonet:info><id>` (character).}
95+
#' \item{title}{Dataset/service title.}
96+
#' \item{abstract}{Dataset/service abstract/description.}
97+
#' \item{serviceType}{Service type, if present (e.g. WMS).}
98+
#' \item{types}{Semicolon-separated `<type>` elements.}
99+
#' \item{source_logo}{Logo path, if present.}
100+
#' \item{links}{List column; each element is a tibble with the parsed links.}
101+
#' }
102+
#'
103+
#' @details
104+
#' The function assumes a GeoNetwork-style XML with `<metadata>` elements and
105+
#' the namespace `geonet:` available for the info block.
106+
#' It is tailored to the GDI Berlin instance but should work for other similar
107+
#' GeoNetwork responses that use the same link encoding (`|` separated).
108+
#'
109+
#' If a record has **no** `<link>` elements, the `links` column will contain
110+
#' a single-row tibble with all `NA` values. This preserves the 1:1 alignment
111+
#' between records and rows.
112+
#'
113+
#' @examples
114+
#' \dontrun{
115+
#' df <- read_geonetwork_services(
116+
#' "https://gdi.berlin.de/geonetwork/srv/ger/q?facet.q=type/service&resultType=details&sortBy=changeDate&from=1&to=100&fast=index"
117+
#' )
118+
#' dplyr::glimpse(df)
119+
#' df$links[[1]]
120+
#' }
121+
#'
122+
#' @importFrom xml2 read_xml xml_find_all xml_find_first xml_text
123+
#' @importFrom purrr map_dfr
124+
#' @importFrom tibble tibble
125+
#' @importFrom stats setNames
126+
#' @export
127+
read_metadata <- function(path_xml) {
128+
doc <- xml2::read_xml(path_xml)
129+
mds <- xml2::xml_find_all(doc, ".//metadata")
130+
131+
purrr::map_dfr(mds, function(md) {
132+
uuid <- xml2::xml_text(xml2::xml_find_first(md, ".//geonet:info/uuid"))
133+
id <- xml2::xml_text(xml2::xml_find_first(md, ".//geonet:info/id"))
134+
title <- xml2::xml_text(xml2::xml_find_first(md, ".//title"))
135+
abst <- xml2::xml_text(xml2::xml_find_first(md, ".//abstract"))
136+
logo <- xml2::xml_text(xml2::xml_find_first(md, ".//logo"))
137+
service_type <- xml2::xml_text(xml2::xml_find_first(md, ".//serviceType"))
138+
type_nodes <- xml2::xml_find_all(md, ".//type")
139+
types <- paste(xml2::xml_text(type_nodes), collapse = ";")
140+
141+
link_nodes <- xml2::xml_find_all(md, ".//link")
142+
143+
links_tbl <- if (length(link_nodes) == 0) {
144+
list(tibble::tibble(
145+
link_name = NA_character_,
146+
link_desc = NA_character_,
147+
link_url = NA_character_,
148+
link_protocol = NA_character_,
149+
link_mime = NA_character_,
150+
link_order = NA_character_
151+
))
152+
} else {
153+
list(purrr::map_dfr(link_nodes, ~parse_gn_link(xml2::xml_text(.x))))
154+
}
155+
156+
tibble::tibble(
157+
geonet_uuid = ifelse(uuid == "", NA_character_, uuid),
158+
geonet_id = ifelse(id == "", NA_character_, id),
159+
title = ifelse(title == "", NA_character_, title),
160+
abstract = ifelse(abst == "", NA_character_, abst),
161+
serviceType = ifelse(service_type == "", NA_character_, service_type),
162+
types = ifelse(types == "", NA_character_, types),
163+
source_logo = ifelse(logo == "", NA_character_, logo),
164+
links = links_tbl
165+
)
166+
})
167+
}

R/read_metadata_all.R

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#' Read all GeoNetwork / Geoportal metadata in chunks
2+
#'
3+
#' This function uses [read_metadata()] repeatedly to fetch **all** available
4+
#' metadata records from a GeoNetwork endpoint that supports `from` / `to`
5+
#' pagination (like the GDI Berlin instance).
6+
#'
7+
#' It first downloads the initial XML, reads the `<summary count="...">`
8+
#' attribute to know how many records exist, then iterates in chunks
9+
#' (default: 100) until all records are read.
10+
#'
11+
#' @param base_url Base GeoNetwork query URL **without** `from` and `to`
12+
#' parameters. Must return an XML with a `<summary count="...">` node.
13+
#' Defaults to the GDI Berlin service search.
14+
#' @param chunk_size Number of records per request. Default: 100.
15+
#'
16+
#' @return A tibble with one row per metadata record, identical in structure
17+
#' to the return value of [read_metadata()], but for **all** pages.
18+
#'
19+
#' @examples
20+
#' \dontrun{
21+
#' all_md <- read_metadata_all()
22+
#' nrow(all_md)
23+
#' }
24+
#'
25+
#' @seealso [read_metadata()]
26+
#' @importFrom xml2 read_xml xml_find_first xml_attr
27+
#' @importFrom dplyr bind_rows
28+
#' @export
29+
read_metadata_all <- function(
30+
base_url = "https://gdi.berlin.de/geonetwork/srv/ger/q?facet.q=type/service&resultType=details&sortBy=changeDate&fast=index",
31+
chunk_size = 100
32+
) {
33+
# 1) Erstes Dokument holen, nur um die summary zu lesen
34+
doc0 <- xml2::read_xml(base_url)
35+
summary_node <- xml2::xml_find_first(doc0, ".//summary")
36+
total_count <- as.integer(xml2::xml_attr(summary_node, "count"))
37+
38+
if (is.na(total_count)) {
39+
stop("Could not read <summary count=\"...\"> from GDI Berlin GeoNetwork response.")
40+
}
41+
42+
# 2) Sequenzen bilden: 1..total_count in chunk_size-Schritten
43+
from_vals <- seq(1, total_count, by = chunk_size)
44+
to_vals <- pmin(from_vals + chunk_size - 1, total_count)
45+
46+
# 3) Alle Chunks abrufen und parsen
47+
res_list <- vector("list", length(from_vals))
48+
49+
for (i in seq_along(from_vals)) {
50+
from_i <- from_vals[i]
51+
to_i <- to_vals[i]
52+
53+
url_i <- sprintf("%s&from=%d&to=%d", base_url, from_i, to_i)
54+
55+
# vorhandene Parserfunktion wiederverwenden
56+
res_list[[i]] <- read_metadata(url_i)
57+
}
58+
59+
# 4) alles zusammenführen
60+
dplyr::bind_rows(res_list)
61+
}

kwb.geoportal.Rproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,4 @@ LineEndingConversion: Posix
1919
BuildType: Package
2020
PackageUseDevtools: Yes
2121
PackageInstallArgs: --no-multiarch --with-keep.source
22-
PackageRoxygenize: rd,collate,namespace
22+
PackageRoxygenize: rd,collate,namespace,vignette

man/grapes-or-or-grapes.Rd

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/parse_gn_link.Rd

Lines changed: 46 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)