-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathqueries.py
More file actions
93 lines (79 loc) · 3.24 KB
/
queries.py
File metadata and controls
93 lines (79 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""
SPARQL Queries for Databus Python Client
This module contains SPARQL queries used for interacting with the DBpedia Databus.
"""
# SPARQL query listing every distribution (downloadable file) of the datasets
# published under the Databus "ontologies" account.
#
# Each result row carries the dataset-level metadata (group, artifact, version,
# title, publisher, comment, description, license) together with the
# distribution-level metadata (download URL, format extension, type, byte size,
# sha256 checksum).
#
# A distribution may have zero or more content variants. They are matched in an
# OPTIONAL block so distributions without any variant are still returned, and
# they are collapsed into one comma-separated ``?contentVariants`` column via
# GROUP_CONCAT so that each distribution yields a single row.
#
# Artifacts whose identifier contains "--DEV" are excluded. The identifier is
# wrapped in STR() because SPARQL's REGEX is defined over string literals:
# ?art is presumably an IRI (it is the object of dataid:artifact), and applying
# REGEX to an IRI is a type error on strict engines, which would make the
# FILTER reject every row. STR() is a no-op for engines that already coerce.
ONTOLOGIES_QUERY = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX databus: <https://databus.dbpedia.org/>
PREFIX dataid: <http://dataid.dbpedia.org/ns/core#>
PREFIX dataid-cv: <http://dataid.dbpedia.org/ns/cv#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT DISTINCT
    ?group ?art ?version ?title ?publisher ?comment ?description
    ?license ?file ?extension ?type ?bytes ?shasum
    (GROUP_CONCAT(DISTINCT ?variantStr; separator=", ") AS ?contentVariants)
WHERE {
    ?dataset dataid:account databus:ontologies .
    ?dataset dataid:group ?group .
    ?dataset dataid:artifact ?art .
    ?dataset dcat:distribution ?distribution .
    ?dataset dct:license ?license .
    ?dataset dct:publisher ?publisher .
    ?dataset rdfs:comment ?comment .
    ?dataset dct:description ?description .
    ?dataset dct:title ?title .
    ?distribution dcat:downloadURL ?file .
    ?distribution dataid:formatExtension ?extension .
    ?distribution dataid-cv:type ?type .
    ?distribution dcat:byteSize ?bytes .
    ?distribution dataid:sha256sum ?shasum .
    ?dataset dct:hasVersion ?version .
    # Excludes dev versions (STR() so the match also works when ?art is an IRI)
    FILTER (!regex(STR(?art), "--DEV"))
    # OPTIONAL: Check for variants, but don't fail if none exist
    OPTIONAL {
        ?distribution dataid:contentVariant ?cv .
        BIND(STR(?cv) AS ?variantStr)
    }
}
GROUP BY ?group ?art ?version ?title ?publisher ?comment ?description ?license ?file ?extension ?type ?bytes ?shasum
ORDER BY ?version
"""
def parse_content_variants_string(variants_str: str) -> dict:
    """
    Turn a comma-separated content-variant string into a dictionary.

    The input is the kind of value produced by a SPARQL ``GROUP_CONCAT``,
    e.g. ``"lang=en, type=full, sorted"``.

    Parameters
    ----------
    variants_str : str
        Comma-separated content variants. Each item is either a ``key=value``
        pair or a standalone token. Surrounding whitespace of items, keys and
        values is ignored. A falsy or whitespace-only input yields ``{}``.

    Returns
    -------
    dict
        One entry per non-empty item:

        - ``key=value`` items map ``key`` to ``value``; the value stays a
          string (``"true"`` is kept as ``"true"``, not converted to a bool).
          Only the first ``=`` separates key from value.
        - Standalone items (no ``=``) map the item itself to boolean ``True``.

        Example: ``"lang=en, type=full, sorted"`` ->
        ``{"lang": "en", "type": "full", "sorted": True}``

    Notes
    -----
    - No boolean or numeric conversion is applied to values; convert them
      after calling this function if needed.
    - When the same key occurs more than once, the last occurrence wins.
    """
    # Nothing to parse for None, "" or a string made only of whitespace.
    if not variants_str or not variants_str.strip():
        return {}

    parsed = {}
    for raw_item in variants_str.split(","):
        item = raw_item.strip()
        if not item:
            # Empty segment, e.g. from ",," or a trailing comma.
            continue

        # partition() splits on the first "=" only; `sep` is empty when the
        # item contains no "=" at all.
        name, sep, val = item.partition("=")
        if sep:
            parsed[name.strip()] = val.strip()
        else:
            # Standalone token: record its presence.
            parsed[item] = True
    return parsed