Skip to content

Commit d93b367

Browse files
committed
Merge pull request #2 from VertNet/feature/build-tables
Add Cascalog queries to build tax, occ, loc, tax_loc tables.
2 parents 62cdee4 + 5f7ce12 commit d93b367

File tree

5 files changed

+157
-62
lines changed

5 files changed

+157
-62
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ pom.xml
44
/classes/
55
.lein-deps-sum
66
.lein-plugins
7+
rm-dwca-reader-clj-jars.sh

project.clj

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@
1111
"-Xms1024M" "-Xmx1048M" "-server"]
1212
:plugins [[swank-clojure "1.4.0-SNAPSHOT"]]
1313
:dependencies [[org.clojure/clojure "1.4.0"]
14-
[cascalog "1.9.0-wip8"]
15-
[dwca-reader-clj "0.1.0-SNAPSHOT"]
14+
[cascalog "1.8.7"]
15+
[cascalog-more-taps-eighty "0.2.0"]
16+
[dwca-reader-clj "0.3.0-SNAPSHOT"]
1617
[cartodb-clj "1.0.0-SNAPSHOT"]]
1718
:dev-dependencies [[org.apache.hadoop/hadoop-core "0.20.2-dev"]
1819
[midje-cascalog "0.4.0"]

src/clj/gulo/core.clj

Lines changed: 99 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,66 +1,110 @@
11
(ns gulo.core
22
"This namespace downloads and harvests a set of Darwin Core Archives using
3-
Cascalog and unicorn magic."
4-
(:use [cascalog.api]
5-
[dwca.core]
6-
[cartodb.client :only (query)]
7-
[clojure.string :only (join split)])
8-
(:require [clojure.java.io :as io])
9-
(:import [org.gbif.dwc.record DarwinCoreRecord]
10-
[java.lang.reflect Field]
11-
[com.google.common.io Files]))
3+
Cascalog and unicorn magic."
4+
(:use [gulo.util :as util :only (latlon-valid? gen-uuid)]
5+
[cascalog.api]
6+
[clojure.contrib.string :as s :only (grep)]
7+
[cascalog.more-taps :as taps :only (hfs-delimited)]
8+
[dwca.core :as dwca]
9+
[cartodb.client :as cdb :only (query)]
10+
[clojure.string :only (join split lower-case)]))
1211

13-
(defn dwca-urls
14-
"Return collection of Darwin Core Archive URLs."
15-
[]
16-
(vec (map #(vals %) (query "vertnet" "SELECT dwca_url FROM publishers"))))
17-
18-
(defn archive-name
19-
"Return archive name from supplied URL as defined by the IPT."
20-
[url]
21-
(str "dwca-" (nth (split url #"=") 1)))
12+
;; ([?kingdom ?phylum ?class ?order ?family ?genus ?species ?sciname]
13+
;; (source :#> 183 {151 ?kingdom 152 ?phylum 153 ?class 154 ?order
14+
;; 155 ?family 156 ?genus 157 ?species 160 ?sciname}))]
2215

23-
(defn field-val
24-
"Return the string value of the supplied record field."
25-
[^Field field ^DarwinCoreRecord rec]
26-
{:pre [(instance? Field field)
27-
(instance? DarwinCoreRecord rec)]}
28-
(.setAccessible field true)
29-
(let [val (.get field rec)]
30-
(cond val (.trim val))))
16+
(defn taxon-location-table
17+
"Create taxon location table."
18+
[taxon location occurrence sink-path]
19+
(let [sink (taps/hfs-delimited sink-path :sinkmode :replace)]
20+
(?<- sink
21+
[?taxon-id ?loc-id ?occ-id]
22+
(taxon ?taxon-id ?name)
23+
(location ?loc-id ?lat ?lon)
24+
(occurrence :#> 183 {0 ?occ-id 22 ?lat 23 ?lon 160 ?name}))))
3125

32-
(defn rec->lines
33-
"Return a tab-delimited string of values in supplied DarwinCoreRecord object."
34-
[^ DarwinCoreRecord rec]
35-
{:pre [(instance? DarwinCoreRecord rec)]}
36-
(let [fields (->> rec .getClass .getDeclaredFields)
37-
values (map #(field-val % rec) fields)]
38-
(join "\t" values)))
26+
(defmapcatop explode-names
27+
"Emits all taxon names."
28+
[kingdom phylum class order family genus species sciname]
29+
(vec (map vector [kingdom phylum class order family genus species sciname])))
3930

40-
(defn grab
41-
"Download and expand a Darwin Core Archive at a URL and return a path to it."
42-
[url]
43-
(let [temp-dir (Files/createTempDir)
44-
temp-path (.getPath temp-dir)
45-
archive-name (archive-name url)
46-
zip-path (str temp-path "/" archive-name ".zip")
47-
archive-path (str temp-path "/" archive-name)]
48-
(download url zip-path)
49-
(unzip zip-path archive-path)
50-
archive-path))
51-
52-
(defmapcatop url->recs
53-
"Emit records as tab-delimited lines from archive located at URL."
31+
(defn taxon-table
32+
"Create taxon table of unique names with generated UUIDs."
33+
[source sink-path]
34+
(let [sink (taps/hfs-delimited sink-path :sinkmode :replace)
35+
unique-names (<- [?name]
36+
(source :#> 183 {151 ?kingdom 152 ?phylum 153 ?class
37+
154 ?order 155 ?family 156 ?genus
38+
157 ?species 160 ?sciname})
39+
(explode-names ?kingdom ?phylum ?class ?order ?family
40+
?genus ?species ?sciname :> ?name))]
41+
(?<- sink
42+
[?uuid ?name]
43+
(unique-names ?name)
44+
(util/gen-uuid :> ?uuid))))
45+
46+
(defn location-table
47+
"Create location table of unique and valid lat/lon with generated UUIDs."
48+
[source sink-path]
49+
(let [sink (taps/hfs-delimited sink-path :sinkmode :replace)
50+
unique-latlons (<- [?lat ?lon]
51+
(source :#> 183 {22 ?lat 23 ?lon})
52+
(util/latlon-valid? ?lat ?lon))]
53+
(?<- sink
54+
[?uuid ?lat ?lon]
55+
(unique-latlons ?lat ?lon)
56+
(util/gen-uuid :> ?uuid))))
57+
58+
(defn explode
59+
[rec]
60+
(vec (cons (util/gen-uuid) (field-vals rec))))
61+
62+
(defmapcatop explode-lines
63+
"Emit records as tab-delimited lines from archive located at URL. A UUID is
64+
prepended to each line for use by Cascalog joins when building other tables."
5465
[url]
55-
(for [rec (get-records (grab url))]
56-
[(rec->lines rec)]))
66+
(vec (map explode (dwca/open url))))
5767

58-
(defn harvest
68+
(defn occurrence-table
5969
"Download and store records from many Darwin Core Archive URLs to a delimited text file."
60-
[sink-path]
61-
(let [source (dwca-urls)
62-
sink (hfs-delimited sink-path :delimiter "\t" :sinkmode :replace)]
70+
[source sink-path]
71+
(let [sink (taps/hfs-delimited sink-path :sinkmode :replace)]
6372
(?<- sink
64-
[?line]
73+
[?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?10 ?11 ?12 ?13 ?14 ?15 ?16 ?17 ?18 ?19
74+
?20 ?21 ?22 ?23 ?24 ?25 ?26 ?27 ?28 ?29 ?30 ?31 ?32 ?33 ?34 ?35 ?36
75+
?37 ?38 ?39 ?40 ?41 ?42 ?43 ?44 ?45 ?46 ?47 ?48 ?49 ?50 ?51 ?52 ?53
76+
?54 ?55 ?56 ?57 ?58 ?59 ?60 ?61 ?62 ?63 ?64 ?65 ?66 ?67 ?68 ?69 ?70
77+
?71 ?72 ?73 ?74 ?75 ?76 ?77 ?78 ?79 ?80 ?81 ?82 ?83 ?84 ?85 ?86 ?87
78+
?88 ?89 ?90 ?91 ?92 ?93 ?94 ?95 ?96 ?97 ?98 ?99 ?100 ?101 ?102 ?103
79+
?104 ?105 ?106 ?107 ?108 ?109 ?110 ?111 ?112 ?113 ?114 ?115 ?116 ?117
80+
?118 ?119 ?120 ?121 ?122 ?123 ?124 ?125 ?126 ?127 ?128 ?129 ?130 ?131
81+
?132 ?133 ?134 ?135 ?136 ?137 ?138 ?139 ?140 ?141 ?142 ?143 ?144 ?145
82+
?146 ?147 ?148 ?149 ?150 ?151 ?152 ?153 ?154 ?155 ?156 ?157 ?158 ?159
83+
?160 ?161 ?162 ?163 ?164 ?165 ?166 ?167 ?168 ?169 ?170 ?171 ?172 ?173
84+
?174 ?175 ?176 ?177 ?178 ?179 ?180 ?181 ?182]
6585
(source ?url)
66-
(url->recs ?url :> ?line))))
86+
(explode-lines ?url :> ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?10 ?11 ?12 ?13
87+
?14 ?15 ?16 ?17 ?18 ?19 ?20 ?21 ?22 ?23 ?24 ?25 ?26 ?27
88+
?28 ?29 ?30 ?31 ?32 ?33 ?34 ?35 ?36 ?37 ?38 ?39 ?40 ?41
89+
?42 ?43 ?44 ?45 ?46 ?47 ?48 ?49 ?50 ?51 ?52 ?53 ?54 ?55
90+
?56 ?57 ?58 ?59 ?60 ?61 ?62 ?63 ?64 ?65 ?66 ?67 ?68 ?69
91+
?70 ?71 ?72 ?73 ?74 ?75 ?76 ?77 ?78 ?79 ?80 ?81 ?82 ?83
92+
?84 ?85 ?86 ?87 ?88 ?89 ?90 ?91 ?92 ?93 ?94 ?95 ?96 ?97
93+
?98 ?99 ?100 ?101 ?102 ?103 ?104 ?105 ?106 ?107 ?108 ?109
94+
?110 ?111 ?112 ?113 ?114 ?115 ?116 ?117 ?118 ?119 ?120
95+
?121 ?122 ?123 ?124 ?125 ?126 ?127 ?128 ?129 ?130 ?131
96+
?132 ?133 ?134 ?135 ?136 ?137 ?138 ?139 ?140 ?141 ?142
97+
?143 ?144 ?145 ?146 ?147 ?148 ?149 ?150 ?151 ?152 ?153
98+
?154 ?155 ?156 ?157 ?158 ?159 ?160 ?161 ?162 ?163 ?164
99+
?165 ?166 ?167 ?168 ?169 ?170 ?171 ?172 ?173 ?174 ?175
100+
?176 ?177 ?178 ?179 ?180 ?181 ?182))))
101+
102+
(defn harvest
103+
[source occ-path loc-path taxon-path taxon-loc-path ]
104+
(occurrence-table source occ-path)
105+
(let [occ-source (taps/hfs-delimited occ-path :sinkmode :replace)]
106+
(location-table occ-source loc-path)
107+
(taxon-table occ-source taxon-path)
108+
(taxon-location-table (taps/hfs-delimited taxon-path :sinkmode :replace)
109+
(taps/hfs-delimited loc-path :sinkmode :replace)
110+
occ-source taxon-loc-path)))

src/clj/gulo/util.clj

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
(ns gulo.util
2+
"This namespace contains utility functions."
3+
(:use [cartodb.client :as cdb :only (query)]))
4+
5+
(defn dwca-urls
6+
"Return collection of Darwin Core Archive URLs."
7+
[]
8+
(vec (map #(vals %) (cdb/query "vertnet" "SELECT dwca_url FROM publishers"))))
9+
10+
(defn gen-uuid
11+
"Return a randomly generated UUID string."
12+
[& x] ;; Cascalog ArityException: Wrong number of args without [& x]
13+
(str (java.util.UUID/randomUUID)))
14+
15+
;; Valid ranges for latitude and longitude.
16+
(def latlon-range {:lat-min -90 :lat-max 90 :lon-min -180 :lon-max 180})
17+
18+
(defn read-latlon
19+
"Converts lat and lon values from string to number."
20+
[lat lon]
21+
{:pre [(instance? java.lang.String lat)
22+
(instance? java.lang.String lon)]}
23+
[(read-string lat) (read-string lon)])
24+
25+
(defn latlon-valid?
26+
"Return true if lat and lon are valid, otherwise return false."
27+
[lat lon]
28+
(try
29+
(let [{:keys [lat-min lat-max lon-min lon-max]} latlon-range
30+
[lat lon] (read-latlon lat lon)]
31+
(and (<= lat lat-max)
32+
(>= lat lat-min)
33+
(<= lon lon-max)
34+
(>= lon lon-min)))
35+
(catch Exception e false)))
36+
37+
(defn occurrence-table-header
38+
"Return the occurrence table header."
39+
[]
40+
(join " "
41+
(for [x (field-keys rec)]
42+
(symbol (clojure.string/replace (lower-case (str x)) ":" "")))))

test/gulo/core_test.clj

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
(ns gulo.core-test
2-
(:use clojure.test
3-
gulo.core))
2+
(:use gulo.core
3+
[midje sweet]
4+
[clojure.string :only (split)])
5+
(:import [com.google.common.io Files]))
46

5-
(deftest a-test
6-
(testing "FIXME, I fail."
7-
(is (= 0 1))))
7+
(fact
8+
"Check harvesting."
9+
(let [source [["http://vertnet.nhm.ku.edu:8080/ipt/archive.do?r=ttrs_mammals"]]
10+
temp-dir (Files/createTempDir)
11+
sink-path (.getPath temp-dir)]
12+
(harvest source sink-path)
13+
(println sink-path)
14+
(count (split (slurp (str sink-path "/part-00000")) #"\n")) => 968))

0 commit comments

Comments
 (0)