|
| 1 | +#' Null-coalescing helper |
| 2 | +#' |
| 3 | +#' Small utility to replace `NULL` or zero-length objects by a default value. |
| 4 | +#' This is handy when parsing GeoNetwork JSON/XML responses where some |
| 5 | +#' elements are simply missing. |
| 6 | +#' |
| 7 | +#' @param x Any object that might be `NULL` or of length 0. |
| 8 | +#' @param y Fallback value to be returned when `x` is `NULL` or length 0. |
| 9 | +#' |
| 10 | +#' @return Either `x` (when present) or `y` (when `x` is missing). |
| 11 | +#' |
| 12 | +#' @keywords internal |
| 13 | +#' @export |
| 14 | +`%||%` <- function(x, y) if (is.null(x) || length(x) == 0) y else x |
| 15 | + |
| 16 | + |
| 17 | +#' Parse a single GeoNetwork link element |
| 18 | +#' |
| 19 | +#' GeoNetwork often encodes links as a single string separated by `|`, e.g.: |
| 20 | +#' `|Darstellungsdienst (WMS)|https://...|OGC:WMS|||`. |
| 21 | +#' This helper splits such a string into named columns and pads missing parts |
| 22 | +#' up to 6 elements. |
| 23 | +#' |
| 24 | +#' The order used here is: |
| 25 | +#' 1. link name |
| 26 | +#' 2. link description |
| 27 | +#' 3. link URL |
| 28 | +#' 4. link protocol (e.g. `"OGC:WMS"`) |
| 29 | +#' 5. MIME type |
| 30 | +#' 6. order |
| 31 | +#' |
| 32 | +#' Note: In many Berlin GDI records the **first** field (link name) is empty, |
| 33 | +#' and the actual meaningful text is in the **second** field (description). |
| 34 | +#' |
| 35 | +#' @param x Character string as found inside a `<link>` XML node. |
| 36 | +#' |
| 37 | +#' @return A one-row tibble with columns: |
| 38 | +#' \itemize{ |
| 39 | +#' \item `link_name` |
| 40 | +#' \item `link_desc` |
| 41 | +#' \item `link_url` |
| 42 | +#' \item `link_protocol` |
| 43 | +#' \item `link_mime` |
| 44 | +#' \item `link_order` |
| 45 | +#' } |
| 46 | +#' |
| 47 | +#' @examples |
| 48 | +#' parse_gn_link("|Darstellungsdienst (WMS)|https://example.org/wms?|OGC:WMS|||") |
| 49 | +#' |
| 50 | +#' @importFrom tibble tibble |
| 51 | +#' @export |
| 52 | +parse_gn_link <- function(x) { |
| 53 | + if (is.null(x) || length(x) == 0) { |
| 54 | + x <- "" |
| 55 | + } |
| 56 | + parts <- strsplit(x, "\\|")[[1]] |
| 57 | + # pad / trim to 6 elements |
| 58 | + if (length(parts) < 6) { |
| 59 | + parts <- c(parts, rep(NA_character_, 6 - length(parts))) |
| 60 | + } else if (length(parts) > 6) { |
| 61 | + parts <- parts[1:6] |
| 62 | + } |
| 63 | + tibble::tibble( |
| 64 | + link_name = parts[1], |
| 65 | + link_desc = parts[2], |
| 66 | + link_url = parts[3], |
| 67 | + link_protocol = parts[4], |
| 68 | + link_mime = parts[5], |
| 69 | + link_order = parts[6] |
| 70 | + ) |
| 71 | +} |
| 72 | + |
| 73 | + |
| 74 | +#' Read GeoNetwork service metadata (one row per service) |
| 75 | +#' |
| 76 | +#' This function reads a GeoNetwork XML search response (as delivered by |
| 77 | +#' `https://gdi.berlin.de/geonetwork/srv/ger/q?...`) and converts it into a |
| 78 | +#' tidy tibble with **one row per metadata record**. |
| 79 | +#' All `<link>` elements of a record are kept together in a **list column** |
| 80 | +#' called `links`, where each entry is itself a tibble created by |
| 81 | +#' [parse_gn_link()]. |
| 82 | +#' |
| 83 | +#' This structure is convenient when you want to keep the dataset-level |
| 84 | +#' information (title, abstract, uuid, ...) together, but still be able to |
| 85 | +#' inspect or unnest all service/download/view links later on. |
| 86 | +#' |
| 87 | +#' @param path_xml Path or URL to the GeoNetwork XML document. This can be a |
| 88 | +#' local file (e.g. `"geoportal_metadaten.xml"`) or a remote URL such as |
| 89 | +#' `"https://gdi.berlin.de/geonetwork/srv/ger/q?..."`. |
| 90 | +#' |
| 91 | +#' @return A tibble with one row per metadata record and the columns: |
| 92 | +#' \describe{ |
| 93 | +#' \item{geonet_uuid}{UUID from `<geonet:info><uuid>` (character).} |
| 94 | +#' \item{geonet_id}{Internal GeoNetwork id from `<geonet:info><id>` (character).} |
| 95 | +#' \item{title}{Dataset/service title.} |
| 96 | +#' \item{abstract}{Dataset/service abstract/description.} |
| 97 | +#' \item{serviceType}{Service type, if present (e.g. WMS).} |
| 98 | +#' \item{types}{Semicolon-separated `<type>` elements.} |
| 99 | +#' \item{source_logo}{Logo path, if present.} |
| 100 | +#' \item{links}{List column; each element is a tibble with the parsed links.} |
| 101 | +#' } |
| 102 | +#' |
| 103 | +#' @details |
| 104 | +#' The function assumes a GeoNetwork-style XML with `<metadata>` elements and |
| 105 | +#' the namespace `geonet:` available for the info block. |
| 106 | +#' It is tailored to the GDI Berlin instance but should work for other similar |
| 107 | +#' GeoNetwork responses that use the same link encoding (`|` separated). |
| 108 | +#' |
| 109 | +#' If a record has **no** `<link>` elements, the `links` column will contain |
| 110 | +#' a single-row tibble with all `NA` values. This preserves the 1:1 alignment |
| 111 | +#' between records and rows. |
| 112 | +#' |
| 113 | +#' @examples |
| 114 | +#' \dontrun{ |
| 115 | +#' df <- read_geonetwork_services( |
| 116 | +#' "https://gdi.berlin.de/geonetwork/srv/ger/q?facet.q=type/service&resultType=details&sortBy=changeDate&from=1&to=100&fast=index" |
| 117 | +#' ) |
| 118 | +#' dplyr::glimpse(df) |
| 119 | +#' df$links[[1]] |
| 120 | +#' } |
| 121 | +#' |
| 122 | +#' @importFrom xml2 read_xml xml_find_all xml_find_first xml_text |
| 123 | +#' @importFrom purrr map_dfr |
| 124 | +#' @importFrom tibble tibble |
| 125 | +#' @importFrom stats setNames |
| 126 | +#' @export |
| 127 | +read_metadata <- function(path_xml) { |
| 128 | + doc <- xml2::read_xml(path_xml) |
| 129 | + mds <- xml2::xml_find_all(doc, ".//metadata") |
| 130 | + |
| 131 | + purrr::map_dfr(mds, function(md) { |
| 132 | + uuid <- xml2::xml_text(xml2::xml_find_first(md, ".//geonet:info/uuid")) |
| 133 | + id <- xml2::xml_text(xml2::xml_find_first(md, ".//geonet:info/id")) |
| 134 | + title <- xml2::xml_text(xml2::xml_find_first(md, ".//title")) |
| 135 | + abst <- xml2::xml_text(xml2::xml_find_first(md, ".//abstract")) |
| 136 | + logo <- xml2::xml_text(xml2::xml_find_first(md, ".//logo")) |
| 137 | + service_type <- xml2::xml_text(xml2::xml_find_first(md, ".//serviceType")) |
| 138 | + type_nodes <- xml2::xml_find_all(md, ".//type") |
| 139 | + types <- paste(xml2::xml_text(type_nodes), collapse = ";") |
| 140 | + |
| 141 | + link_nodes <- xml2::xml_find_all(md, ".//link") |
| 142 | + |
| 143 | + links_tbl <- if (length(link_nodes) == 0) { |
| 144 | + list(tibble::tibble( |
| 145 | + link_name = NA_character_, |
| 146 | + link_desc = NA_character_, |
| 147 | + link_url = NA_character_, |
| 148 | + link_protocol = NA_character_, |
| 149 | + link_mime = NA_character_, |
| 150 | + link_order = NA_character_ |
| 151 | + )) |
| 152 | + } else { |
| 153 | + list(purrr::map_dfr(link_nodes, ~parse_gn_link(xml2::xml_text(.x)))) |
| 154 | + } |
| 155 | + |
| 156 | + tibble::tibble( |
| 157 | + geonet_uuid = ifelse(uuid == "", NA_character_, uuid), |
| 158 | + geonet_id = ifelse(id == "", NA_character_, id), |
| 159 | + title = ifelse(title == "", NA_character_, title), |
| 160 | + abstract = ifelse(abst == "", NA_character_, abst), |
| 161 | + serviceType = ifelse(service_type == "", NA_character_, service_type), |
| 162 | + types = ifelse(types == "", NA_character_, types), |
| 163 | + source_logo = ifelse(logo == "", NA_character_, logo), |
| 164 | + links = links_tbl |
| 165 | + ) |
| 166 | + }) |
| 167 | +} |
0 commit comments