|
1 | 1 | (ns gulo.core |
2 | 2 | "This namespace downloads and harvests a set of Darwin Core Archives using |
3 | | - Cascalog and unicorn magic." |
4 | | - (:use [cascalog.api] |
5 | | - [dwca.core] |
6 | | - [cartodb.client :only (query)] |
7 | | - [clojure.string :only (join split)]) |
8 | | - (:require [clojure.java.io :as io]) |
9 | | - (:import [org.gbif.dwc.record DarwinCoreRecord] |
10 | | - [java.lang.reflect Field] |
11 | | - [com.google.common.io Files])) |
| 3 | + Cascalog and unicorn magic." |
| 4 | + (:use [gulo.util :as util :only (latlon-valid? gen-uuid)] |
| 5 | + [cascalog.api] |
| 6 | + [clojure.contrib.string :as s :only (grep)] |
| 7 | + [cascalog.more-taps :as taps :only (hfs-delimited)] |
| 8 | + [dwca.core :as dwca] |
| 9 | + [cartodb.client :as cdb :only (query)] |
| 10 | + [clojure.string :only (join split lower-case)])) |
12 | 11 |
|
13 | | -(defn dwca-urls |
14 | | - "Return collection of Darwin Core Archive URLs." |
15 | | - [] |
16 | | -(vec (map #(vals %) (query "vertnet" "SELECT dwca_url FROM publishers")))) |
17 | | - |
18 | | -(defn archive-name |
19 | | - "Return archive name from supplied URL as defined by the IPT." |
20 | | - [url] |
21 | | - (str "dwca-" (nth (split url #"=") 1))) |
| 12 | +;; ([?kingdom ?phylum ?class ?order ?family ?genus ?species ?sciname] |
| 13 | +;; (source :#> 183 {151 ?kingdom 152 ?phylum 153 ?class 154 ?order |
| 14 | +;; 155 ?family 156 ?genus 157 ?species 160 ?sciname}))] |
22 | 15 |
|
23 | | -(defn field-val |
24 | | - "Return the string value of the supplied record field." |
25 | | - [^Field field ^DarwinCoreRecord rec] |
26 | | - {:pre [(instance? Field field) |
27 | | - (instance? DarwinCoreRecord rec)]} |
28 | | - (.setAccessible field true) |
29 | | - (let [val (.get field rec)] |
30 | | - (cond val (.trim val)))) |
(defn taxon-location-table
  "Run a Cascalog query that writes [taxon-id loc-id occ-id] tuples to a
  delimited sink at sink-path, replacing any existing output.

  Arguments:
    taxon      - generator of [id name] tuples
    location   - generator of [id lat lon] tuples
    occurrence - generator of 183-field tuples; position 0 is read as the
                 occurrence id, 22 and 23 as lat/lon, and 160 as the name
    sink-path  - output path handed to hfs-delimited

  The shared ?name and ?lat/?lon variables tie each occurrence to its taxon
  and location rows."
  [taxon location occurrence sink-path]
  (?<- (taps/hfs-delimited sink-path :sinkmode :replace)
       [?taxon-id ?loc-id ?occ-id]
       (taxon ?taxon-id ?name)
       (location ?loc-id ?lat ?lon)
       ;; Positional selector: pick 4 of the 183 occurrence fields by index.
       (occurrence :#> 183 {0 ?occ-id 22 ?lat 23 ?lon 160 ?name})))
31 | 25 |
|
32 | | -(defn rec->lines |
33 | | - "Return a tab dilinated string of values in supplied DarwinCoreRecord object." |
34 | | - [^ DarwinCoreRecord rec] |
35 | | - {:pre [(instance? DarwinCoreRecord rec)]} |
36 | | - (let [fields (->> rec .getClass .getDeclaredFields) |
37 | | - values (map #(field-val % rec) fields)] |
38 | | - (join "\t" values))) |
(defmapcatop explode-names
  "Emit each of the eight supplied taxon names as its own 1-tuple, in argument
  order. Values are passed through untouched, so nil or blank names are
  emitted as well."
  [kingdom phylum class order family genus species sciname]
  (vec (for [taxon-name [kingdom phylum class order family genus species sciname]]
         [taxon-name])))
39 | 30 |
|
40 | | -(defn grab |
41 | | - "Download and expand a Darwin Core Archive at a URL and return a path to it." |
42 | | - [url] |
43 | | - (let [temp-dir (Files/createTempDir) |
44 | | - temp-path (.getPath temp-dir) |
45 | | - archive-name (archive-name url) |
46 | | - zip-path (str temp-path "/" archive-name ".zip") |
47 | | - archive-path (str temp-path "/" archive-name)] |
48 | | - (download url zip-path) |
49 | | - (unzip zip-path archive-path) |
50 | | - archive-path)) |
51 | | - |
52 | | -(defmapcatop url->recs |
53 | | - "Emit records as tab delineated lines from archive located at URL." |
(defn taxon-table
  "Write [uuid name] tuples to a delimited sink at sink-path, replacing any
  existing output.

  source is expected to yield 183-field occurrence tuples. The names at
  positions 151-157 (kingdom through species) and 160 (scientific name) are
  exploded into one ?name per tuple by explode-names, and each resulting name
  is paired with a UUID from util/gen-uuid.

  NOTE(review): uniqueness of names presumably relies on Cascalog's default
  de-duplication of subquery output - confirm."
  [source sink-path]
  (let [names-query (<- [?name]
                        (source :#> 183 {151 ?kingdom 152 ?phylum 153 ?class
                                         154 ?order 155 ?family 156 ?genus
                                         157 ?species 160 ?sciname})
                        (explode-names ?kingdom ?phylum ?class ?order ?family
                                       ?genus ?species ?sciname :> ?name))
        out-tap (taps/hfs-delimited sink-path :sinkmode :replace)]
    (?<- out-tap
         [?uuid ?name]
         (names-query ?name)
         (util/gen-uuid :> ?uuid))))
| 45 | + |
(defn location-table
  "Write [uuid lat lon] tuples to a delimited sink at sink-path, replacing any
  existing output.

  source is expected to yield 183-field occurrence tuples; positions 22 and 23
  are read as lat and lon. Only pairs accepted by util/latlon-valid? are kept,
  and each surviving pair is given a UUID from util/gen-uuid.

  NOTE(review): uniqueness of lat/lon pairs presumably relies on Cascalog's
  default de-duplication of subquery output - confirm."
  [source sink-path]
  (let [latlon-query (<- [?lat ?lon]
                         (source :#> 183 {22 ?lat 23 ?lon})
                         (util/latlon-valid? ?lat ?lon))
        out-tap (taps/hfs-delimited sink-path :sinkmode :replace)]
    (?<- out-tap
         [?uuid ?lat ?lon]
         (latlon-query ?lat ?lon)
         (util/gen-uuid :> ?uuid))))
| 57 | + |
(defn explode
  "Return a vector holding a freshly generated UUID followed by the field
  values of rec, as produced by field-vals."
  [rec]
  ;; field-vals is not defined in this namespace; presumably referred from
  ;; dwca.core via :use - confirm.
  (into [(util/gen-uuid)] (field-vals rec)))
| 61 | + |
(defmapcatop explode-lines
  "Emit one tuple per record read from the archive at url via dwca/open. Each
  tuple is the record's field values with a UUID prepended (see explode), which
  gives Cascalog a key to join on when building other tables."
  [url]
  (vec (for [rec (dwca/open url)]
         (explode rec))))
57 | 67 |
|
58 | | -(defn harvest |
(defn occurrence-table
  "Expand every Darwin Core Archive URL produced by source into occurrence
  tuples and write them to a delimited sink at sink-path, replacing any
  existing output.

  Each output tuple has 183 fields, ?0 through ?182, exactly as emitted by
  explode-lines; ?0 is the UUID that explode prepends to the record's field
  values."
  [source sink-path]
  (?<- (taps/hfs-delimited sink-path :sinkmode :replace)
       ;; Output fields: all 183 positions, in order.
       [?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9
        ?10 ?11 ?12 ?13 ?14 ?15 ?16 ?17 ?18 ?19
        ?20 ?21 ?22 ?23 ?24 ?25 ?26 ?27 ?28 ?29
        ?30 ?31 ?32 ?33 ?34 ?35 ?36 ?37 ?38 ?39
        ?40 ?41 ?42 ?43 ?44 ?45 ?46 ?47 ?48 ?49
        ?50 ?51 ?52 ?53 ?54 ?55 ?56 ?57 ?58 ?59
        ?60 ?61 ?62 ?63 ?64 ?65 ?66 ?67 ?68 ?69
        ?70 ?71 ?72 ?73 ?74 ?75 ?76 ?77 ?78 ?79
        ?80 ?81 ?82 ?83 ?84 ?85 ?86 ?87 ?88 ?89
        ?90 ?91 ?92 ?93 ?94 ?95 ?96 ?97 ?98 ?99
        ?100 ?101 ?102 ?103 ?104 ?105 ?106 ?107 ?108 ?109
        ?110 ?111 ?112 ?113 ?114 ?115 ?116 ?117 ?118 ?119
        ?120 ?121 ?122 ?123 ?124 ?125 ?126 ?127 ?128 ?129
        ?130 ?131 ?132 ?133 ?134 ?135 ?136 ?137 ?138 ?139
        ?140 ?141 ?142 ?143 ?144 ?145 ?146 ?147 ?148 ?149
        ?150 ?151 ?152 ?153 ?154 ?155 ?156 ?157 ?158 ?159
        ?160 ?161 ?162 ?163 ?164 ?165 ?166 ?167 ?168 ?169
        ?170 ?171 ?172 ?173 ?174 ?175 ?176 ?177 ?178 ?179
        ?180 ?181 ?182]
       (source ?url)
       (explode-lines ?url :>
                      ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9
                      ?10 ?11 ?12 ?13 ?14 ?15 ?16 ?17 ?18 ?19
                      ?20 ?21 ?22 ?23 ?24 ?25 ?26 ?27 ?28 ?29
                      ?30 ?31 ?32 ?33 ?34 ?35 ?36 ?37 ?38 ?39
                      ?40 ?41 ?42 ?43 ?44 ?45 ?46 ?47 ?48 ?49
                      ?50 ?51 ?52 ?53 ?54 ?55 ?56 ?57 ?58 ?59
                      ?60 ?61 ?62 ?63 ?64 ?65 ?66 ?67 ?68 ?69
                      ?70 ?71 ?72 ?73 ?74 ?75 ?76 ?77 ?78 ?79
                      ?80 ?81 ?82 ?83 ?84 ?85 ?86 ?87 ?88 ?89
                      ?90 ?91 ?92 ?93 ?94 ?95 ?96 ?97 ?98 ?99
                      ?100 ?101 ?102 ?103 ?104 ?105 ?106 ?107 ?108 ?109
                      ?110 ?111 ?112 ?113 ?114 ?115 ?116 ?117 ?118 ?119
                      ?120 ?121 ?122 ?123 ?124 ?125 ?126 ?127 ?128 ?129
                      ?130 ?131 ?132 ?133 ?134 ?135 ?136 ?137 ?138 ?139
                      ?140 ?141 ?142 ?143 ?144 ?145 ?146 ?147 ?148 ?149
                      ?150 ?151 ?152 ?153 ?154 ?155 ?156 ?157 ?158 ?159
                      ?160 ?161 ?162 ?163 ?164 ?165 ?166 ?167 ?168 ?169
                      ?170 ?171 ?172 ?173 ?174 ?175 ?176 ?177 ?178 ?179
                      ?180 ?181 ?182)))
| 101 | + |
(defn harvest
  "Build the four output tables in sequence:

    1. occurrence-table writes the occurrences from source to occ-path.
    2. location-table and taxon-table read that occurrence output back and
       write to loc-path and taxon-path.
    3. taxon-location-table reads the taxon, location and occurrence outputs
       and writes to taxon-loc-path.

  Every path is opened through hfs-delimited with :sinkmode :replace. Returns
  whatever the final taxon-location-table call returns."
  [source occ-path loc-path taxon-path taxon-loc-path]
  (let [tap-for (fn [path] (taps/hfs-delimited path :sinkmode :replace))]
    ;; The occurrence output must exist before it is opened as a source below.
    (occurrence-table source occ-path)
    (let [occ-tap (tap-for occ-path)]
      (location-table occ-tap loc-path)
      (taxon-table occ-tap taxon-path)
      (taxon-location-table (tap-for taxon-path)
                            (tap-for loc-path)
                            occ-tap
                            taxon-loc-path))))
0 commit comments