From 072d1b8e041fb9ccb72b3627df0fb38bcfa3bda9 Mon Sep 17 00:00:00 2001 From: Bartek Ciszkowski Date: Thu, 28 Nov 2024 15:50:22 -0400 Subject: [PATCH 1/3] Add GoogleFetcher This allows the fetching of items using files.get from Google Drive --- tika-grpc/pom.xml | 6 + tika-pipes/tika-fetchers/pom.xml | 3 +- .../tika-fetchers/tika-fetcher-google/pom.xml | 105 +++++++++ .../fetchers/google/GoogleDriveFetcher.java | 200 ++++++++++++++++++ .../config/GoogleDriveFetcherConfig.java | 78 +++++++ 5 files changed, 391 insertions(+), 1 deletion(-) create mode 100644 tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml create mode 100644 tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java create mode 100644 tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java diff --git a/tika-grpc/pom.xml b/tika-grpc/pom.xml index 5603a62b007..68b90cd17ed 100644 --- a/tika-grpc/pom.xml +++ b/tika-grpc/pom.xml @@ -223,6 +223,12 @@ tika-fetcher-http ${project.version} + + + org.apache.tika + tika-fetcher-google + ${project.version} + com.fasterxml.jackson.module jackson-module-jsonSchema diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml index 999d269fcf6..2fe67146452 100644 --- a/tika-pipes/tika-fetchers/pom.xml +++ b/tika-pipes/tika-fetchers/pom.xml @@ -37,6 +37,7 @@ tika-fetcher-gcs tika-fetcher-az-blob tika-fetcher-microsoft-graph + tika-fetcher-google @@ -45,4 +46,4 @@ 3.0.0-rc1 - \ No newline at end of file + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml new file mode 100644 index 00000000000..963ad8d8f17 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml @@ -0,0 +1,105 @@ + + + + 4.0.0 + + + tika-fetchers + org.apache.tika + 3.0.0-SNAPSHOT + + + tika-fetcher-google + Google Tika Pipes Fetcher + + + 2.2.0 + + + + + + 
${project.groupId} + tika-core + ${project.version} + + + + + com.google.api-client + google-api-client + ${google.api.client.version} + + + + com.google.auth + google-auth-library-oauth2-http + 1.19.0 + + + + + com.google.apis + google-api-services-drive + v3-rev20241027-2.0.0 + + + + + org.slf4j + slf4j-api + + + + + commons-io + commons-io + + + + + org.junit.jupiter + junit-jupiter + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + + org.apache.tika.pipes.fetcher.s3 + + + + + + + + + 3.0.0-BETA-rc1 + + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java new file mode 100644 index 00000000000..94a21740ee4 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.fetchers.google; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Base64; +import java.util.List; +import java.util.Map; + +import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport; +import com.google.api.client.http.HttpRequestInitializer; +import com.google.api.client.json.JsonFactory; +import com.google.api.client.json.gson.GsonFactory; +import com.google.api.services.drive.Drive; +import com.google.api.services.drive.DriveScopes; +import com.google.auth.http.HttpCredentialsAdapter; +import com.google.auth.oauth2.GoogleCredentials; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetchers.google.config.GoogleDriveFetcherConfig; + + +/** + * GoogleDrive Fetcher allows the fetching of files from a Google Drive, using a + * service account key. + * + * Fetch Keys are ${fileId},${subjectUser}, where the subject user is the + * organizer of the file. This user is necessary as part of the key as the + * service account must act on behalf of the user when querying for the file. 
+ */
+public class GoogleDriveFetcher extends AbstractFetcher implements Initializable {
+    private static final Logger LOGGER = LoggerFactory.getLogger(GoogleDriveFetcher.class);
+    private static final JsonFactory JSON_FACTORY = GsonFactory.getDefaultInstance();
+
+    // Service-account credentials built by initialize(); every fetch derives
+    // per-user delegated credentials from them.
+    private GoogleCredentials baseCredentials;
+
+    // Single source of truth for all settings. The @Field setters below only write
+    // into this object, so initialize() and fetch() must read from it as well.
+    private GoogleDriveFetcherConfig config = new GoogleDriveFetcherConfig();
+
+    /**
+     * Creates a fetcher with default settings; configure it through the
+     * {@code @Field} setters before calling {@link #initialize(Map)}.
+     */
+    public GoogleDriveFetcher() {
+    }
+
+    /**
+     * Creates a fetcher backed by the given configuration.
+     *
+     * @param config settings used for credentials, scopes, spooling and retries
+     */
+    public GoogleDriveFetcher(GoogleDriveFetcherConfig config) {
+        this.config = config;
+    }
+
+    /**
+     * Sets the retry schedule from a comma-delimited list of seconds, e.g. {@code "5,10,15"}.
+     * Whitespace around each value is ignored.
+     *
+     * @param commaDelimitedLongs comma-delimited, non-negative whole numbers of seconds
+     * @throws TikaConfigException if any value is missing, not a number, or negative
+     */
+    @Field
+    public void setThrottleSeconds(String commaDelimitedLongs) throws TikaConfigException {
+        String[] longStrings = (commaDelimitedLongs == null ? "" : commaDelimitedLongs).split(",");
+        long[] seconds = new long[longStrings.length];
+        for (int i = 0; i < longStrings.length; i++) {
+            try {
+                seconds[i] = Long.parseLong(longStrings[i].trim());
+            } catch (NumberFormatException e) {
+                // Keep the cause so the offending token shows up in the stack trace.
+                throw new TikaConfigException(
+                        "Invalid throttleSeconds value '" + longStrings[i] + "' in: " + commaDelimitedLongs, e);
+            }
+            if (seconds[i] < 0) {
+                // Thread.sleep rejects negative durations; fail at configuration time instead.
+                throw new TikaConfigException("throttleSeconds must not be negative: " + seconds[i]);
+            }
+        }
+        setThrottleSeconds(seconds);
+    }
+
+    /**
+     * Sets the retry schedule. The array length is the maximum number of fetch attempts;
+     * entry {@code i} is the number of seconds to wait after failed attempt {@code i}
+     * before the next attempt is made.
+     *
+     * @param throttleSeconds seconds to sleep between attempts
+     */
+    public void setThrottleSeconds(long[] throttleSeconds) {
+        config.setThrottleSeconds(throttleSeconds);
+    }
+
+    /**
+     * @param spoolToTemp if {@code true}, the file is copied to a local temporary file
+     *                    before the stream is returned
+     */
+    @Field
+    public void setSpoolToTemp(boolean spoolToTemp) {
+        config.setSpoolToTemp(spoolToTemp);
+    }
+
+    /**
+     * @param serviceAccountKeyBase64 the service account key, Base64 encoded
+     */
+    @Field
+    public void setServiceAccountKeyBase64(String serviceAccountKeyBase64) {
+        config.setServiceAccountKeyBase64(serviceAccountKeyBase64);
+    }
+
+    /**
+     * Stores a subject user in the configuration. Note that {@link #fetch} takes the
+     * user to impersonate from the fetch key, not from this setting.
+     *
+     * @param subjectUser user stored in the configuration
+     */
+    @Field
+    public void setSubjectUser(String subjectUser) {
+        config.setSubjectUser(subjectUser);
+    }
+
+    /**
+     * Sets the OAuth scopes requested for the service account. A {@code null} or empty
+     * list falls back to {@link DriveScopes#DRIVE_READONLY}.
+     *
+     * @param scopes OAuth scopes; the list is copied
+     */
+    @Field
+    public void setScopes(List<String> scopes) {
+        List<String> copy = scopes == null ? new ArrayList<>() : new ArrayList<>(scopes);
+        if (copy.isEmpty()) {
+            copy.add(DriveScopes.DRIVE_READONLY);
+        }
+        config.setScopes(copy);
+    }
+
+    /**
+     * Decodes the configured service account key and builds the scoped base credentials.
+     *
+     * @param map initialization parameters; not used by this fetcher
+     * @throws TikaConfigException if the key is missing, is not valid Base64, or cannot
+     *                             be read as Google credentials
+     */
+    @Override
+    public void initialize(Map<String, Param> map) throws TikaConfigException {
+        String keyBase64 = config.getServiceAccountKeyBase64();
+        if (keyBase64 == null || keyBase64.trim().isEmpty()) {
+            throw new TikaConfigException("serviceAccountKeyBase64 must be set");
+        }
+
+        // Read the scopes from the config: that is where setScopes() stores them, and it
+        // is the only place they exist when the config constructor was used.
+        List<String> scopes = config.getScopes();
+        if (scopes == null || scopes.isEmpty()) {
+            scopes = new ArrayList<>();
+            scopes.add(DriveScopes.DRIVE_READONLY);
+        }
+
+        try {
+            byte[] keyBytes = Base64.getDecoder().decode(keyBase64.trim());
+            baseCredentials = GoogleCredentials
+                    .fromStream(new ByteArrayInputStream(keyBytes))
+                    .createScoped(scopes);
+        } catch (IllegalArgumentException | IOException e) {
+            // IllegalArgumentException: the key is not valid Base64.
+            throw new TikaConfigException("Failed to initialize Google Drive service", e);
+        }
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler initializableProblemHandler) throws TikaConfigException {
+        // Nothing further to verify: initialize() already fails on a missing or unreadable key.
+    }
+
+    /**
+     * Fetches a file from Google Drive on behalf of the user named in the fetch key.
+     * A failed attempt is retried according to the configured throttle schedule; a
+     * malformed fetch key is rejected immediately without retrying.
+     *
+     * @param fetchKey     {@code ${fileId},${subjectUser}}
+     * @param metadata     metadata passed along when the file is spooled to a temporary file
+     * @param parseContext not used by this fetcher
+     * @return a stream over the file content
+     * @throws TikaException if the fetcher is not initialized, the fetch key is malformed,
+     *                       or every attempt failed (the last failure is the cause)
+     */
+    @Override
+    public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException {
+        if (baseCredentials == null) {
+            throw new TikaException("GoogleDriveFetcher has not been initialized");
+        }
+
+        // Validate once, up front: retrying cannot repair a malformed key.
+        String[] fetchKeySplit = fetchKey == null ? new String[0] : fetchKey.split(",");
+        if (fetchKeySplit.length != 2) {
+            throw new TikaException("Invalid fetch key, expected format ${fileId},${subjectUser}: " + fetchKey);
+        }
+        String fileId = fetchKeySplit[0];
+        String subjectUser = fetchKeySplit[1];
+
+        long[] throttleSeconds = config.getThrottleSeconds();
+        // Always make at least one attempt, even with an empty schedule.
+        int maxTries = Math.max(1, throttleSeconds.length);
+        Exception ex = null;
+
+        for (int tries = 0; tries < maxTries; tries++) {
+            long start = System.currentTimeMillis();
+            try {
+                return fetchOnce(fileId, subjectUser, fetchKey, metadata);
+            } catch (Exception e) {
+                LOGGER.warn("Exception fetching on retry=" + tries, e);
+                ex = e;
+            } finally {
+                long elapsed = System.currentTimeMillis() - start;
+                LOGGER.debug("Total to fetch {}", elapsed);
+            }
+
+            if (tries + 1 >= maxTries) {
+                // No attempt left: do not make the caller wait before reporting the failure.
+                break;
+            }
+
+            LOGGER.warn("Sleeping for {} seconds before retry", throttleSeconds[tries]);
+            try {
+                Thread.sleep(throttleSeconds[tries] * 1000);
+            } catch (InterruptedException e) {
+                // Restore the flag and stop retrying so the caller can shut down promptly.
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+
+        throw new TikaException("Could not fetch " + fetchKey, ex);
+    }
+
+    /**
+     * Performs a single fetch attempt: builds a Drive client that acts as the subject user
+     * and opens the media stream of the file.
+     */
+    private InputStream fetchOnce(String fileId, String subjectUser, String fetchKey, Metadata metadata)
+            throws IOException, java.security.GeneralSecurityException {
+        GoogleCredentials delegatedCredentials = baseCredentials.createDelegated(subjectUser);
+        HttpRequestInitializer requestInitializer = new HttpCredentialsAdapter(delegatedCredentials);
+
+        // Local on purpose: the client is bound to one subject user, so it must not be
+        // shared between concurrent fetches through a field.
+        Drive driveService = new Drive.Builder(
+                GoogleNetHttpTransport.newTrustedTransport(),
+                JSON_FACTORY,
+                requestInitializer).setApplicationName("tika-fetcher-google").build();
+
+        InputStream is = driveService.files()
+                .get(fileId)
+                .executeMediaAsInputStream();
+
+        if (is == null) {
+            throw new IOException("Empty input stream when we tried to parse " + fetchKey);
+        }
+
+        if (!config.isSpoolToTemp()) {
+            return TikaInputStream.get(is);
+        }
+
+        TemporaryResources tmp = new TemporaryResources();
+        try (InputStream in = is) {
+            Path tmpPath = tmp.createTempFile(fileId + ".dat");
+            // createTempFile has already created the file, so the copy must overwrite it.
+            Files.copy(in, tmpPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+            // Hand tmp to the stream so that closing the stream deletes the temp file.
+            Metadata spoolMetadata = metadata == null ? new Metadata() : metadata;
+            return TikaInputStream.get(tmpPath, spoolMetadata, tmp);
+        } catch (IOException | RuntimeException e) {
+            // Do not leave the temp file behind when spooling fails.
+            try {
+                tmp.close();
+            } catch (IOException closeFailure) {
+                e.addSuppressed(closeFailure);
+            }
+            throw e;
+        }
+    }
+}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java
new file mode 100644
index 00000000000..f03db469559
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.google.config;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.pipes.fetcher.config.AbstractConfig;
+
+/**
+ * Settings for the Google Drive fetcher: the Base64-encoded service account key, the
+ * OAuth scopes to request, whether to spool fetched files to a temporary file, and the
+ * retry schedule. Every setter returns {@code this} so calls can be chained.
+ */
+public class GoogleDriveFetcherConfig extends AbstractConfig {
+    // null means "not configured"; getThrottleSeconds() then supplies the default schedule.
+    private long[] throttleSeconds;
+    private boolean spoolToTemp;
+    protected String serviceAccountKeyBase64;
+    protected String subjectUser;
+    private List<String> scopes = new ArrayList<>();
+
+    /**
+     * @return {@code true} if fetched files should be copied to a temporary file
+     */
+    public boolean isSpoolToTemp() {
+        return spoolToTemp;
+    }
+
+    public GoogleDriveFetcherConfig setSpoolToTemp(boolean spoolToTemp) {
+        this.spoolToTemp = spoolToTemp;
+        return this;
+    }
+
+    /**
+     * @return the configured retry intervals in seconds, or {@code {5, 10, 15}} when
+     *         none have been set; never {@code null}
+     */
+    public long[] getThrottleSeconds() {
+        if (throttleSeconds == null) {
+            // A fresh array on every call, so a caller cannot alter the defaults.
+            return new long[]{5, 10, 15}; // Default retry intervals
+        }
+        return throttleSeconds;
+    }
+
+    /**
+     * @param throttleSeconds retry intervals in seconds; {@code null} restores the default
+     */
+    public GoogleDriveFetcherConfig setThrottleSeconds(long[] throttleSeconds) {
+        this.throttleSeconds = throttleSeconds;
+        return this;
+    }
+
+    /**
+     * @return the service account key, Base64 encoded, or {@code null} if not set
+     */
+    public String getServiceAccountKeyBase64() {
+        return serviceAccountKeyBase64;
+    }
+
+    public GoogleDriveFetcherConfig setServiceAccountKeyBase64(String serviceAccountKeyBase64) {
+        this.serviceAccountKeyBase64 = serviceAccountKeyBase64;
+        return this;
+    }
+
+    /**
+     * @return the configured subject user, or {@code null} if not set
+     */
+    public String getSubjectUser() {
+        return subjectUser;
+    }
+
+    public GoogleDriveFetcherConfig setSubjectUser(String subjectUser) {
+        this.subjectUser = subjectUser;
+        return this;
+    }
+
+    /**
+     * @return the OAuth scopes; possibly empty, never {@code null}
+     */
+    public List<String> getScopes() {
+        return scopes;
+    }
+
+    /**
+     * @param scopes OAuth scopes to request; {@code null} is stored as an empty list so
+     *               that {@link #getScopes()} never returns {@code null}
+     */
+    public GoogleDriveFetcherConfig setScopes(List<String> scopes) {
+        this.scopes = scopes == null ? new ArrayList<>() : scopes;
+        return this;
+    }
+}
From e1e7ce46f27108ef045b28bcd72214368d2c1bc3 Mon Sep 17 00:00:00 2001
From: Bartek Ciszkowski
Date: Thu, 5 Dec 2024 20:46:14 -0400
Subject: [PATCH 2/3] Target tika-fetcher-google for 4.0.0-SNAPSHOT

---
 tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml
index 963ad8d8f17..61b9c61be50
100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml @@ -25,7 +25,7 @@ tika-fetchers org.apache.tika - 3.0.0-SNAPSHOT + 4.0.0-SNAPSHOT tika-fetcher-google From c8d3ea7746e7e0e19f87b63d3a7d24acc7c06179 Mon Sep 17 00:00:00 2001 From: Tilman Hausherr Date: Mon, 9 Dec 2024 14:05:21 +0100 Subject: [PATCH 3/3] Cleanup pom.xml files --- tika-grpc/pom.xml | 1 - tika-parent/pom.xml | 30 +++++++++++++++++++ .../tika-fetchers/tika-fetcher-google/pom.xml | 20 +++++++++++-- 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/tika-grpc/pom.xml b/tika-grpc/pom.xml index 68b90cd17ed..6a1f3f60b40 100644 --- a/tika-grpc/pom.xml +++ b/tika-grpc/pom.xml @@ -223,7 +223,6 @@ tika-fetcher-http ${project.version} - org.apache.tika tika-fetcher-google diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index d66b13476c2..ec8f0c24a59 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -639,6 +639,16 @@ com.google.guava guava ${guava.version} + + + com.google.errorprone + error_prone_annotations + + + com.google.j2objc + j2objc-annotations + + com.googlecode.json-simple @@ -1048,6 +1058,26 @@ jspecify 1.0.0 + + com.google.http-client + google-http-client + 1.45.2 + + + com.google.http-client + google-http-client-gson + 1.45.2 + + + io.grpc + grpc-context + 1.68.2 + + + com.google.auth + google-auth-library-credentials + 1.30.0 + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml index 61b9c61be50..2f5e97ead6c 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml @@ -32,7 +32,7 @@ Google Tika Pipes Fetcher - 2.2.0 + 2.7.0 @@ -48,12 +48,28 @@ com.google.api-client google-api-client ${google.api.client.version} + + + com.google.auth + google-auth-library-oauth2-http + + com.google.auth google-auth-library-oauth2-http - 1.19.0 + 1.30.0 + + + com.google.errorprone 
+ error_prone_annotations + + + com.google.j2objc + j2objc-annotations + +