From cf5db56e420b9f67ae1f9d0f0c0fe38df38ffe2b Mon Sep 17 00:00:00 2001 From: Guichard Desrosiers Date: Tue, 26 May 2026 19:45:54 -0400 Subject: [PATCH] Add configurable gzip compression level The gzip layer previously used GZIPOutputStream with the JDK's default compression level (level 6), with no way for schemas to choose a different level. This change exposes the speed-vs-size trade-off that the gzip format already supports, giving users control over how their data is compressed during unparsing. As a side benefit, this fix also addresses cross-zlib test failures observed on Fedora 42. Fedora's OpenJDK links against zlib-ng while Temurin and most other distributions bundle stock zlib. Both produce valid gzip output, but the deflate token streams differ due to different match-finding heuristics, causing tests with hardcoded byte baselines to fail on Fedora. For the small test data used in Daffodil's gzip tests, level 9 happens to produce identical output on both implementations, letting the tests pass regardless of which zlib is linked. This convergence at level 9 is empirical, not guaranteed. Changes: - Add `compressionLevel` DFDL variable to the gzip layer schema, defaulting to Deflater.DEFAULT_COMPRESSION (-1). Schemas can override via `dfdl:newVariableInstance` or `dfdl:setVariable`, and users can set the value externally without having to modify the schema. - Add ConfigurableGZIPOutputStream, a GZIPOutputStream subclass that allows the compression level to be set via constructor argument. - Update GZipLayer to accept the compressionLevel variable via setLayerVariableParameters and use it when constructing the output stream. - Remove GZIPFixedOutputStream and the associated fixIsNeeded() method. These existed to work around a pre-Java-16 bug where GZIPOutputStream wrote 0x00 instead of the spec-compliant 0xFF for the gzip OS header byte. Since Daffodil's minimum supported Java version is now 17, this workaround is no longer needed. - Add `-parameters` to javac options This preserves Java parameter names in bytecode, which is required by Daffodil's reflection-based layer parameter resolution. Without this, Java setters appear with parameters named arg0/arg1/... and cannot be matched to schema variables. - Update test schemas (exampleGzipLayer.dfdl.xsd, exampleGzipLayer2.dfdl.xsd) to set compressionLevel=9 via newVariableInstance. - Update TestGzipErrors.makeGZIPData to generate test input data at level 9 for byte-stable output across JVMs, with comments documenting the empirical nature of the convergence. DAFFODIL-3082 --- build.sbt | 4 + .../ConfigurableGZIPOutputStream.java | 59 ++++++++ .../daffodil/layers/runtime1/GZipLayer.java | 136 ++++-------------- .../daffodil/layers/xsd/gzipLayer.dfdl.xsd | 4 +- .../daffodil/core/layers/TestGzipErrors.scala | 15 +- .../daffodil/core/layers/TestLayers.scala | 6 +- .../layers/runtime1/TestGzipLayer.scala | 31 +--- .../daffodil/layers/exampleGzipLayer.dfdl.xsd | 11 ++ .../layers/exampleGzipLayer2.dfdl.xsd | 11 ++ 9 files changed, 126 insertions(+), 151 deletions(-) create mode 100644 daffodil-core/src/main/java/org/apache/daffodil/layers/runtime1/ConfigurableGZIPOutputStream.java diff --git a/build.sbt b/build.sbt index 38d910eacb..b9f9801745 100644 --- a/build.sbt +++ b/build.sbt @@ -247,6 +247,10 @@ def buildJavacOptions() = { "-deprecation", "-Xlint:dep-ann", "-Xlint:unchecked", + // Preserve Java parameter names in bytecode so Daffodil's reflection-based + // layer parameter resolution can match method parameters to DFDL variables + // by name (otherwise they appear as arg0/arg1/...). + "-parameters", "--release", minSupportedJavaVersion ) diff --git a/daffodil-core/src/main/java/org/apache/daffodil/layers/runtime1/ConfigurableGZIPOutputStream.java b/daffodil-core/src/main/java/org/apache/daffodil/layers/runtime1/ConfigurableGZIPOutputStream.java new file mode 100644 index 0000000000..6b042fb645 --- /dev/null +++ b/daffodil-core/src/main/java/org/apache/daffodil/layers/runtime1/ConfigurableGZIPOutputStream.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.daffodil.layers.runtime1; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.zip.GZIPOutputStream; + +/** + * A {@link GZIPOutputStream} subclass that allows the caller to specify the + * compression level explicitly. The standard {@code GZIPOutputStream} uses + * the JDK's default level (which zlib resolves to level 6), with no + * mechanism to choose a different level. This subclass gives callers control + * over the speed-vs-compression-ratio trade-off that the gzip format supports. + * + *

According to the zlib documentation and the logic in {@code deflate.c}, + * compression levels range from 0 to 9: + *

+ * + *

As the compression level increases, encoding speed decreases. Some + * reports indicate that level 9 is approximately 10x slower than the default + * for only about 16% additional compression. + */ +public final class ConfigurableGZIPOutputStream extends GZIPOutputStream { + + /** + * Creates a {@code ConfigurableGZIPOutputStream} that writes compressed + * data to the given output stream at the specified compression level. + * + * @param out the output stream to write compressed data to + * @param level the compression level; an integer in the range 0-9, + * or {@link java.util.zip.Deflater#DEFAULT_COMPRESSION} + * ({@code -1}) for the default + * @throws IOException if an I/O error occurs + * @throws IllegalArgumentException if {@code level} is not a valid compression level + */ + public ConfigurableGZIPOutputStream(OutputStream out, int level) throws IOException { + super(out); + this.def.setLevel(level); + } +} \ No newline at end of file diff --git a/daffodil-core/src/main/java/org/apache/daffodil/layers/runtime1/GZipLayer.java b/daffodil-core/src/main/java/org/apache/daffodil/layers/runtime1/GZipLayer.java index df37f95ff9..bfab2b3373 100644 --- a/daffodil-core/src/main/java/org/apache/daffodil/layers/runtime1/GZipLayer.java +++ b/daffodil-core/src/main/java/org/apache/daffodil/layers/runtime1/GZipLayer.java @@ -22,130 +22,50 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.util.Objects; import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; +import java.util.zip.Deflater; public final class GZipLayer extends Layer { - - private static Boolean fixNeeded = null; - - public static boolean fixIsNeeded() { - if (Objects.isNull(fixNeeded)) { - // prior to java 16 - String versionString = System.getProperty("java.version"); - - // Extract the major version using string manipulation - int majorVersion; - if (versionString.startsWith("1.8")) { - majorVersion = 8; - } else { - String[] parts = versionString.split("\\."); - assert (parts.length > 0); - majorVersion = Integer.parseInt(parts[0]); - } - fixNeeded = (majorVersion < 16); - } - return fixNeeded; - } - + + private int compressionLevel = Deflater.DEFAULT_COMPRESSION; + public GZipLayer() { super("gzip", "urn:org.apache.daffodil.layers.gzip"); // convert IOExceptions to Processing Errors setProcessingErrorException(IOException.class); } - @Override - public InputStream wrapLayerInput(InputStream jis) throws Exception { - return new GZIPInputStream(jis); - } - - @Override - public OutputStream wrapLayerOutput(OutputStream jos) throws Exception { - OutputStream fixedOS = fixIsNeeded() ? new GZIPFixedOutputStream(jos) : jos; - return new GZIPOutputStream(fixedOS); - } - -} - -/** - * Prior to Java 16, the java.util.zip.GZIPOutputStream wrote a value of zero for - * the OS field in the header (byte index 9). - * In Java 16, this was changed to a - * value of 255 to better abide by the GZIP specification. Unfortunately, this - * means unparsed data using a GZIP layer might have a single byte difference, - * depending on the Java version used. This can lead to inconsistent behavior of - * test failures that expect a certain byte value. - *

- * To resolve this issue, we create this GZIPFixedOutputStream. This should wrap - * the underlying OutputStream and be passed as the OutputStream to the - * GZIPOutputStream. When the GZIPOutputStream writes the 9th byte to this - * GZIPFixedOutputStream, this will always write a value of 255, making all Java - * versions prior to 16 consistent with Java 16+ behavior. - */ -class GZIPFixedOutputStream extends OutputStream { - - private final OutputStream os; - - public GZIPFixedOutputStream(OutputStream os) { - this.os = os; - } - /** - * The next byte position that byte will be written to. If this is negative, - * that means we have already fixed the output and everything should just - * pass straight through. + * Provides the layer's parameter variables to the layer. + * + *

The compression level controls the trade-off between encoding speed + * and output size when gzip-compressing data during unparsing. Higher + * levels produce smaller output but take longer to encode. + * + * @param compressionLevel an integer specifying the gzip compression level + * to use when unparsing. Valid values are 0 through + * 9, where 0 means no compression and 9 means maximum + * compression, or {@link Deflater#DEFAULT_COMPRESSION} + * ({@code -1}) to use the underlying zlib library's + * default (level 6). */ - private int bytePosition = 0; - - @Override - public void close() throws IOException { - os.close(); + public void setLayerVariableParameters(int compressionLevel) { + boolean isValidRange = (compressionLevel >= 0 && compressionLevel <= 9) + || compressionLevel == Deflater.DEFAULT_COMPRESSION; + if (!isValidRange) + runtimeSchemaDefinitionError("Invalid compression level: " + compressionLevel); + + this.compressionLevel = compressionLevel; } - + @Override - public void flush() throws IOException { - os.flush(); + public InputStream wrapLayerInput(InputStream jis) throws Exception { + return new GZIPInputStream(jis); } @Override - public void write(byte[] b, int off, int len) throws IOException { - if (bytePosition < 0) { - // The bad byte has been fixed, pass all writes directly through to the - // underlying OutputStream. This may be more efficient than the default - // OutputStream write() function, which writes the bytes from this array - // one at a time - os.write(b, off, len); - } else { - // The bad byte has not been fixed yet. Unless a newer version of Java - // has made changes, the GZIPOutputStreamm will have passed in a 10 byte - // array to this function that includes the bad byte. Let's just write - // that array using the default write(array) method that writes these - // bytes one at a time and will call the write(int) method that will fix - // that byte. Calling write() one at a time is maybe inefficient but for - // such a small array it should not have a noticeable effect. - super.write(b, off, len); - } + public OutputStream wrapLayerOutput(OutputStream jos) throws Exception { + return new ConfigurableGZIPOutputStream(jos, compressionLevel); } - @Override - public void write(int b) throws IOException { - if (bytePosition < 0) { - // The bad byte has already been fixed, simply pass this byte through to - // the underlying OutputStream - os.write(b); - } else if (bytePosition < 9) { - // The bad byte has not been fixed, and we haven't reached it yet, simply - // pass this byte through and increment our byte position - os.write(b); - bytePosition += 1; - } else if (bytePosition == 9) { - // This is the bad byte, it is a 0 on some Java versions. Write 255 - // instead of to match Java 16+ behavior. Also, set bytePosition to -1 to - // signify that we have fixed the bad byte and that all other writes - // should just pass directly to the underlying OutputStream - os.write(255); - bytePosition = -1; - } - } } diff --git a/daffodil-core/src/main/resources/org/apache/daffodil/layers/xsd/gzipLayer.dfdl.xsd b/daffodil-core/src/main/resources/org/apache/daffodil/layers/xsd/gzipLayer.dfdl.xsd index 30aca2ae04..d8a8a802fb 100644 --- a/daffodil-core/src/main/resources/org/apache/daffodil/layers/xsd/gzipLayer.dfdl.xsd +++ b/daffodil-core/src/main/resources/org/apache/daffodil/layers/xsd/gzipLayer.dfdl.xsd @@ -26,8 +26,8 @@ - + + diff --git a/daffodil-core/src/test/scala/org/apache/daffodil/core/layers/TestGzipErrors.scala b/daffodil-core/src/test/scala/org/apache/daffodil/core/layers/TestGzipErrors.scala index 474d2b9644..508f975a09 100644 --- a/daffodil-core/src/test/scala/org/apache/daffodil/core/layers/TestGzipErrors.scala +++ b/daffodil-core/src/test/scala/org/apache/daffodil/core/layers/TestGzipErrors.scala @@ -19,6 +19,7 @@ package org.apache.daffodil.core.layers import java.io.ByteArrayOutputStream import java.nio.charset.StandardCharsets +import java.util.zip.Deflater import scala.xml.Elem import org.apache.daffodil.core.util.FuzzOneBits @@ -27,6 +28,7 @@ import org.apache.daffodil.core.util.FuzzRandomSingleByte import org.apache.daffodil.core.util.FuzzZeroBits import org.apache.daffodil.core.util.LayerParseTester import org.apache.daffodil.core.util.TestUtils +import org.apache.daffodil.layers.runtime1.ConfigurableGZIPOutputStream import org.apache.daffodil.lib.util.* import org.apache.daffodil.lib.xml.XMLUtils @@ -85,14 +87,15 @@ a few lines of pointless text like this.""".replace("\r\n", "\n").replace("\n", def makeGZIPData(text: String) = { val baos = new ByteArrayOutputStream() - val gzos = new java.util.zip.GZIPOutputStream(baos) + // Level 9 happens to produce identical deflate output on both stock zlib and + // zlib-ng for this particular test data, letting the hardcoded baselines below + // match regardless of which implementation the JVM uses. This is empirical and + // could break with different inputs or future zlib versions. + val gzos = new ConfigurableGZIPOutputStream(baos, Deflater.BEST_COMPRESSION) + IOUtils.write(text, gzos, StandardCharsets.UTF_8) gzos.close() - val data = baos.toByteArray() - // Java 16+ sets the 9th byte to 0xFF, but previous Java versions set the - // value to 0x00. Daffodil always unparses with 0xFF regardless of Java - // version, so force the gzip data to 0xFF to make sure tests round trip - data(9) = 0xff.toByte + val data = baos.toByteArray data } diff --git a/daffodil-core/src/test/scala/org/apache/daffodil/core/layers/TestLayers.scala b/daffodil-core/src/test/scala/org/apache/daffodil/core/layers/TestLayers.scala index fe4375130d..832601b625 100644 --- a/daffodil-core/src/test/scala/org/apache/daffodil/core/layers/TestLayers.scala +++ b/daffodil-core/src/test/scala/org/apache/daffodil/core/layers/TestLayers.scala @@ -177,11 +177,7 @@ class TestLayers { val gzos = new java.util.zip.GZIPOutputStream(baos) IOUtils.write(text, gzos, StandardCharsets.UTF_8) gzos.close() - val data = baos.toByteArray() - // Java 16+ sets the 9th byte to 0xFF, but previous Java versions set the - // value to 0x00. Daffodil always unparses with 0xFF regardless of Java - // version, so force the gzip data to 0xFF to make sure tests round trip - data(9) = 0xff.toByte + val data = baos.toByteArray data } diff --git a/daffodil-core/src/test/scala/org/apache/daffodil/layers/runtime1/TestGzipLayer.scala b/daffodil-core/src/test/scala/org/apache/daffodil/layers/runtime1/TestGzipLayer.scala index 4d770bc697..1426a16c6e 100644 --- a/daffodil-core/src/test/scala/org/apache/daffodil/layers/runtime1/TestGzipLayer.scala +++ b/daffodil-core/src/test/scala/org/apache/daffodil/layers/runtime1/TestGzipLayer.scala @@ -17,10 +17,6 @@ package org.apache.daffodil.layers.runtime1 -import java.io.ByteArrayOutputStream -import java.util.zip.GZIPOutputStream - -import org.apache.commons.io.IOUtils import org.junit.Assert.* import org.junit.Test @@ -28,32 +24,7 @@ class TestGzipLayer { val gl = new GZipLayer // default constructor has to work. - @Test def testConstruction() = { + @Test def testConstruction(): Unit = { assertNotNull(gl) } - - /** - * Tests that our gzip fixer fixes up byte 9 of the gzip header - * so that it has 255 in it for java versions prior to Java 16 - */ - @Test def testByte9() = { - val baos = new ByteArrayOutputStream() - val baosRaw = new ByteArrayOutputStream() - val gzos = gl.wrapLayerOutput(baos) - val gzosRaw = new GZIPOutputStream(baosRaw) - val data = "testing 1, 2, 3" - IOUtils.write(data, gzos, "ascii") - IOUtils.write(data, gzosRaw, "ascii") - gzos.close() - gzosRaw.close() - val byte9 = baos.toByteArray.apply(9) - val byte9Raw = baosRaw.toByteArray.apply(9) - if (GZipLayer.fixIsNeeded()) { - assertEquals(0, byte9Raw.toInt & 0xff) - assertEquals(255, byte9.toInt & 0xff) - } else { - assertEquals(255, byte9Raw.toInt & 0xff) - assertEquals(255, byte9.toInt & 0xff) - } - } } diff --git a/daffodil-test/src/test/resources/org/apache/daffodil/layers/exampleGzipLayer.dfdl.xsd b/daffodil-test/src/test/resources/org/apache/daffodil/layers/exampleGzipLayer.dfdl.xsd index be49050ec9..455d93dce9 100644 --- a/daffodil-test/src/test/resources/org/apache/daffodil/layers/exampleGzipLayer.dfdl.xsd +++ b/daffodil-test/src/test/resources/org/apache/daffodil/layers/exampleGzipLayer.dfdl.xsd @@ -59,6 +59,17 @@ + + + Level 9 (slowest) is used here only to improve the likelihood of + deterministic gzip output across different JVMs and zlib implementations + during testing. Real-world users should either omit this newVariableInstance + to use the default level (6) or choose a level appropriate for their needs. + + + + + diff --git a/daffodil-test/src/test/resources/org/apache/daffodil/layers/exampleGzipLayer2.dfdl.xsd b/daffodil-test/src/test/resources/org/apache/daffodil/layers/exampleGzipLayer2.dfdl.xsd index 976492ca83..62d48db9ca 100644 --- a/daffodil-test/src/test/resources/org/apache/daffodil/layers/exampleGzipLayer2.dfdl.xsd +++ b/daffodil-test/src/test/resources/org/apache/daffodil/layers/exampleGzipLayer2.dfdl.xsd @@ -93,6 +93,17 @@ now the gzipped layered sequence itself --> + + + Level 9 (slowest) is used here only to improve the likelihood of + deterministic gzip output across different JVMs and zlib implementations + during testing. Real-world users should either omit this newVariableInstance + to use the default level (6) or choose a level appropriate for their needs. + + + + +