|
18 | 18 | package org.apache.beam.sdk.extensions.gcp.util; |
19 | 19 |
|
20 | 20 | import static org.apache.beam.sdk.io.FileSystemUtils.wildcardToRegexp; |
| 21 | +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; |
21 | 22 | import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; |
22 | 23 |
|
23 | 24 | import com.google.api.gax.paging.Page; |
24 | 25 | import com.google.auto.value.AutoValue; |
25 | 26 | import com.google.cloud.storage.Blob; |
| 27 | +import com.google.cloud.storage.BlobId; |
| 28 | +import com.google.cloud.storage.BlobInfo; |
26 | 29 | import com.google.cloud.storage.Bucket; |
27 | 30 | import com.google.cloud.storage.BucketInfo; |
| 31 | +import com.google.cloud.storage.CopyWriter; |
28 | 32 | import com.google.cloud.storage.Storage; |
29 | 33 | import com.google.cloud.storage.Storage.BlobField; |
30 | 34 | import com.google.cloud.storage.Storage.BlobGetOption; |
31 | 35 | import com.google.cloud.storage.Storage.BlobListOption; |
32 | 36 | import com.google.cloud.storage.Storage.BucketField; |
33 | 37 | import com.google.cloud.storage.Storage.BucketGetOption; |
| 38 | +import com.google.cloud.storage.Storage.CopyRequest; |
34 | 39 | import com.google.cloud.storage.StorageBatch; |
35 | 40 | import com.google.cloud.storage.StorageBatchResult; |
36 | 41 | import com.google.cloud.storage.StorageException; |
@@ -71,18 +76,27 @@ public GcsUtilV2 create(PipelineOptions options) { |
71 | 76 | /** Maximum number of requests permitted in a GCS batch request. */ |
72 | 77 | private static final int MAX_REQUESTS_PER_BATCH = 100; |
73 | 78 |
|
| 79 | + /** |
| 80 | + * Limit the number of bytes Cloud Storage will attempt to copy before responding to an individual |
| 81 | + * request. If you see Read Timeout errors, try reducing this value. |
| 82 | + */ |
| 83 | + private static final long MEGABYTES_COPIED_PER_CHUNK = 2048L; |
| 84 | + |
74 | 85 | GcsUtilV2(PipelineOptions options) { |
75 | 86 | String projectId = options.as(GcpOptions.class).getProject(); |
76 | 87 | storage = StorageOptions.newBuilder().setProjectId(projectId).build().getService(); |
77 | 88 | } |
78 | 89 |
|
79 | 90 | @SuppressWarnings({ |
80 | | - "nullness" // For Creating AccessDeniedException and FileAlreadyExistsException with null. |
| 91 | + "nullness" // For Creating AccessDeniedException FileNotFoundException, and |
| 92 | + // FileAlreadyExistsException with null. |
81 | 93 | }) |
82 | 94 | private IOException translateStorageException(GcsPath gcsPath, StorageException e) { |
83 | 95 | switch (e.getCode()) { |
84 | 96 | case 403: |
85 | 97 | return new AccessDeniedException(gcsPath.toString(), null, e.getMessage()); |
| 98 | + case 404: |
| 99 | + return new FileNotFoundException(e.getMessage()); |
86 | 100 | case 409: |
87 | 101 | return new FileAlreadyExistsException(gcsPath.toString(), null, e.getMessage()); |
88 | 102 | default: |
@@ -259,6 +273,151 @@ public List<GcsPath> expand(GcsPath gcsPattern) throws IOException { |
259 | 273 | return results; |
260 | 274 | } |
261 | 275 |
|
| 276 | + public enum MissingStrategy { |
| 277 | + FAIL_IF_MISSING, |
| 278 | + SKIP_IF_MISSING, |
| 279 | + } |
| 280 | + |
| 281 | + public void remove(Iterable<GcsPath> paths, MissingStrategy strategy) throws IOException { |
| 282 | + for (List<GcsPath> pathPartition : |
| 283 | + Lists.partition(Lists.newArrayList(paths), MAX_REQUESTS_PER_BATCH)) { |
| 284 | + |
| 285 | + // Create a new empty batch every time |
| 286 | + StorageBatch batch = storage.batch(); |
| 287 | + List<StorageBatchResult<Boolean>> batchResultFutures = new ArrayList<>(); |
| 288 | + |
| 289 | + for (GcsPath path : pathPartition) { |
| 290 | + batchResultFutures.add(batch.delete(path.getBucket(), path.getObject())); |
| 291 | + } |
| 292 | + batch.submit(); |
| 293 | + |
| 294 | + for (int i = 0; i < batchResultFutures.size(); i++) { |
| 295 | + StorageBatchResult<Boolean> future = batchResultFutures.get(i); |
| 296 | + try { |
| 297 | + Boolean deleted = future.get(); |
| 298 | + if (!deleted) { |
| 299 | + if (strategy == MissingStrategy.FAIL_IF_MISSING) { |
| 300 | + throw new FileNotFoundException( |
| 301 | + String.format( |
| 302 | + "The specified file does not exist: %s", pathPartition.get(i).toString())); |
| 303 | + } else { |
| 304 | + LOG.warn("Ignoring failed deletion on file {}.", pathPartition.get(i).toString()); |
| 305 | + } |
| 306 | + } |
| 307 | + } catch (StorageException e) { |
| 308 | + throw translateStorageException(pathPartition.get(i), e); |
| 309 | + } |
| 310 | + } |
| 311 | + } |
| 312 | + } |
| 313 | + |
| 314 | + public enum OverwriteStrategy { |
| 315 | + FAIL_IF_EXISTS, // Fail if target exists |
| 316 | + SKIP_IF_EXISTS, // Skip if target exists |
| 317 | + SAFE_OVERWRITE, // Overwrite only if the generation matches (atomic) |
| 318 | + ALWAYS_OVERWRITE // Overwrite regardless of state |
| 319 | + } |
| 320 | + |
| 321 | + private void rewriteHelper( |
| 322 | + Iterable<GcsPath> srcPaths, |
| 323 | + Iterable<GcsPath> dstPaths, |
| 324 | + boolean deleteSrc, |
| 325 | + MissingStrategy srcMissing, |
| 326 | + OverwriteStrategy dstOverwrite) |
| 327 | + throws IOException { |
| 328 | + List<GcsPath> srcList = Lists.newArrayList(srcPaths); |
| 329 | + List<GcsPath> dstList = Lists.newArrayList(dstPaths); |
| 330 | + checkArgument( |
| 331 | + srcList.size() == dstList.size(), |
| 332 | + "Number of source files %s must equal number of destination files %s", |
| 333 | + srcList.size(), |
| 334 | + dstList.size()); |
| 335 | + |
| 336 | + for (int i = 0; i < srcList.size(); i++) { |
| 337 | + GcsPath srcPath = srcList.get(i); |
| 338 | + GcsPath dstPath = dstList.get(i); |
| 339 | + BlobId srcId = BlobId.of(srcPath.getBucket(), srcPath.getObject()); |
| 340 | + BlobId dstId = BlobId.of(dstPath.getBucket(), dstPath.getObject()); |
| 341 | + |
| 342 | + CopyRequest.Builder copyRequestBuilder = |
| 343 | + CopyRequest.newBuilder() |
| 344 | + .setSource(srcId) |
| 345 | + .setMegabytesCopiedPerChunk(MEGABYTES_COPIED_PER_CHUNK); |
| 346 | + |
| 347 | + if (dstOverwrite == OverwriteStrategy.ALWAYS_OVERWRITE) { |
| 348 | + copyRequestBuilder.setTarget(dstId); |
| 349 | + } else { |
| 350 | + // FAIL_IF_EXISTS, SKIP_IF_EXISTS and SAFE_OVERWRITE require checking the target blob |
| 351 | + BlobInfo existingTarget; |
| 352 | + try { |
| 353 | + existingTarget = storage.get(dstId); |
| 354 | + } catch (StorageException e) { |
| 355 | + throw translateStorageException(dstPath, e); |
| 356 | + } |
| 357 | + |
| 358 | + if (existingTarget == null) { |
| 359 | + copyRequestBuilder.setTarget(dstId, Storage.BlobTargetOption.doesNotExist()); |
| 360 | + } else { |
| 361 | + switch (dstOverwrite) { |
| 362 | + case SKIP_IF_EXISTS: |
| 363 | + LOG.warn("Ignoring rewriting from {} to {} because target exists.", srcPath, dstPath); |
| 364 | + continue; // Skip to next file in for-loop |
| 365 | + |
| 366 | + case SAFE_OVERWRITE: |
| 367 | + copyRequestBuilder.setTarget( |
| 368 | + dstId, Storage.BlobTargetOption.generationMatch(existingTarget.getGeneration())); |
| 369 | + break; |
| 370 | + |
| 371 | + case FAIL_IF_EXISTS: |
| 372 | + throw new FileAlreadyExistsException( |
| 373 | + srcPath.toString(), |
| 374 | + dstPath.toString(), |
| 375 | + "Target object already exists and strategy is FAIL_IF_EXISTS"); |
| 376 | + default: |
| 377 | + throw new IllegalStateException("Unknown OverwriteStrategy: " + dstOverwrite); |
| 378 | + } |
| 379 | + } |
| 380 | + } |
| 381 | + |
| 382 | + try { |
| 383 | + CopyWriter copyWriter = storage.copy(copyRequestBuilder.build()); |
| 384 | + copyWriter.getResult(); |
| 385 | + |
| 386 | + if (deleteSrc) { |
| 387 | + if (!storage.delete(srcId)) { |
| 388 | + // This may happen if the source file is deleted by another process after copy. |
| 389 | + LOG.warn( |
| 390 | + "Source file {} could not be deleted after move to {}. It may not have existed.", |
| 391 | + srcPath, |
| 392 | + dstPath); |
| 393 | + } |
| 394 | + } |
| 395 | + } catch (StorageException e) { |
| 396 | + if (e.getCode() == 404 && srcMissing == MissingStrategy.SKIP_IF_MISSING) { |
| 397 | + LOG.warn( |
| 398 | + "Ignoring rewriting from {} to {} because source does not exist.", srcPath, dstPath); |
| 399 | + continue; |
| 400 | + } |
| 401 | + throw translateStorageException(srcPath, e); |
| 402 | + } |
| 403 | + } |
| 404 | + } |
| 405 | + |
| 406 | + public void copy( |
| 407 | + Iterable<GcsPath> srcPaths, Iterable<GcsPath> dstPaths, OverwriteStrategy strategy) |
| 408 | + throws IOException { |
| 409 | + rewriteHelper(srcPaths, dstPaths, false, MissingStrategy.FAIL_IF_MISSING, strategy); |
| 410 | + } |
| 411 | + |
| 412 | + public void move( |
| 413 | + Iterable<GcsPath> srcPaths, |
| 414 | + Iterable<GcsPath> dstPaths, |
| 415 | + MissingStrategy srcMissing, |
| 416 | + OverwriteStrategy dstOverwrite) |
| 417 | + throws IOException { |
| 418 | + rewriteHelper(srcPaths, dstPaths, true, srcMissing, dstOverwrite); |
| 419 | + } |
| 420 | + |
262 | 421 | /** Get the {@link Bucket} from Cloud Storage path or propagates an exception. */ |
263 | 422 | public Bucket getBucket(GcsPath path, BucketGetOption... options) throws IOException { |
264 | 423 | String bucketName = path.getBucket(); |
|
0 commit comments