33
44module OAI
55 module Harvester
6-
76 class Harvest
7+ DIRECTORY_LAYOUT = "%Y/%m" . freeze
88
99 def initialize ( config = nil , directory = nil , date = nil )
1010 @config = config || Config . load
@@ -43,22 +43,27 @@ def harvest(site)
4343 # Allow a from date to be passed in
4444 opts [ :from ] = earliest ( opts [ :url ] ) unless opts [ :from ]
4545 opts . delete ( :set ) if 'all' == opts [ :set ]
46-
4746 begin
4847 # Connect, and download
4948 file , records = call ( opts . delete ( :url ) , opts )
5049
51- # Move document to storage directory
52- dir = File . join ( @directory , date_based_directory ( harvest_time ) )
53- FileUtils . mkdir_p dir
54- FileUtils . mv ( file . path ,
55- File . join ( dir , "#{ site } -#{ filename ( Time . parse ( opts [ :from ] ) ,
56- harvest_time ) } .xml.gz") )
50+ # Move document to storage directory if configured
51+ if @directory
52+ directory_layout = @config . layouts [ site ] if @config . layouts
53+ dir = File . join ( @directory , date_based_directory ( harvest_time , directory_layout ) )
54+ FileUtils . mkdir_p dir
55+ FileUtils . mv ( file . path ,
56+ File . join ( dir , "#{ site } -#{ filename ( Time . parse ( opts [ :from ] ) ,
57+ harvest_time ) } .xml.gz") )
58+ else
59+ puts "no configured destination for temp file" if @interactive
60+ end
5761 @config . sites [ site ] [ 'last' ] = harvest_time
58- rescue
59- raise $! unless $!. respond_to? ( :code )
60- raise $! if not @interactive || "noRecordsMatch" != $!. code
61- puts "No new records available"
62+ rescue OAI ::NoMatchException
63+ puts "No new records available" if @interactive
64+ rescue OAI ::Exception => ex
65+ raise ex if not @interactive
66+ puts ex . message
6267 end
6368 end
6469
@@ -69,15 +74,15 @@ def call(url, opts)
6974 records = 0 ;
7075 client = OAI ::Client . new ( url , :parser => @parser )
7176 provider_config = client . identify
72-
77+
7378 file = Tempfile . new ( 'oai_data' )
7479 gz = Zlib ::GzipWriter . new ( file )
7580 gz << "<?xml version=\" 1.0\" encoding=\" UTF-8\" ?>\n "
7681 gz << "<records>"
7782 begin
7883 response = client . list_records ( options )
79- get_records ( response . doc ) . each do |rec |
80- gz << rec
84+ response . each do |rec |
85+ gz << rec . _source
8186 records += 1
8287 end
8388 puts "#{ records } records retrieved" if @interactive
@@ -89,8 +94,8 @@ def call(url, opts)
8994 puts "\n resumption token recieved, continuing" if @interactive
9095 response = client . list_records ( :resumption_token =>
9196 response . resumption_token )
92- get_records ( response . doc ) . each do |rec |
93- gz << rec
97+ response . each do |rec |
98+ gz << rec . _source
9499 records += 1
95100 end
96101 puts "#{ records } records retrieved" if @interactive
@@ -118,8 +123,9 @@ def build_options_hash(site)
118123 options
119124 end
120125
121- def date_based_directory ( time )
122- "#{ time . strftime ( DIRECTORY_LAYOUT ) } "
126+ def date_based_directory ( time , directory_layout = nil )
127+ directory_layout ||= Harvest ::DIRECTORY_LAYOUT
128+ "#{ time . strftime ( directory_layout ) } "
123129 end
124130
125131 def filename ( from_time , until_time )
0 commit comments