Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,10 @@ def buildJavacOptions() = {
"-deprecation",
"-Xlint:dep-ann",
"-Xlint:unchecked",
// Preserve Java parameter names in bytecode so Daffodil's reflection-based
// layer parameter resolution can match method parameters to DFDL variables
// by name (otherwise they appear as arg0/arg1/...).
"-parameters",
"--release",
minSupportedJavaVersion
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.daffodil.layers.runtime1;

import java.io.IOException;
import java.io.OutputStream;
import java.util.zip.GZIPOutputStream;

/**
* A {@link GZIPOutputStream} subclass that allows the caller to specify the
* compression level explicitly. The standard {@code GZIPOutputStream} uses
* the JDK's default level (which zlib resolves to level 6), with no
* mechanism to choose a different level. This subclass gives callers control
* over the speed-vs-compression-ratio trade-off that the gzip format supports.
*
* <p>According to the zlib documentation and the logic in {@code deflate.c},
* compression levels range from 0 to 9:
* <ul>
* <li>Level 0: no compression (fastest)</li>
* <li>Level 1: minimal compression, fastest of the compressing levels</li>
* <li>Level 9: best compression, slowest</li>
* </ul>
*
* <p>As the compression level increases, encoding speed decreases. Some
* reports indicate that level 9 is approximately 10x slower than the default
* for only about 16% additional compression.
*/
public final class ConfigurableGZIPOutputStream extends GZIPOutputStream {

/**
* Creates a {@code ConfigurableGZIPOutputStream} that writes compressed
* data to the given output stream at the specified compression level.
*
* @param out the output stream to write compressed data to
* @param level the compression level; an integer in the range 0-9,
* or {@link java.util.zip.Deflater#DEFAULT_COMPRESSION}
* ({@code -1}) for the default
* @throws IOException if an I/O error occurs
* @throws IllegalArgumentException if {@code level} is not a valid compression level
*/
public ConfigurableGZIPOutputStream(OutputStream out, int level) throws IOException {
super(out);
this.def.setLevel(level);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,130 +22,50 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Objects;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.Deflater;

public final class GZipLayer extends Layer {

private static Boolean fixNeeded = null;

public static boolean fixIsNeeded() {
if (Objects.isNull(fixNeeded)) {
// prior to java 16
String versionString = System.getProperty("java.version");

// Extract the major version using string manipulation
int majorVersion;
if (versionString.startsWith("1.8")) {
majorVersion = 8;
} else {
String[] parts = versionString.split("\\.");
assert (parts.length > 0);
majorVersion = Integer.parseInt(parts[0]);
}
fixNeeded = (majorVersion < 16);
}
return fixNeeded;
}


private int compressionLevel = Deflater.DEFAULT_COMPRESSION;

public GZipLayer() {
super("gzip", "urn:org.apache.daffodil.layers.gzip");
// convert IOExceptions to Processing Errors
setProcessingErrorException(IOException.class);
}

@Override
public InputStream wrapLayerInput(InputStream jis) throws Exception {
return new GZIPInputStream(jis);
}

@Override
public OutputStream wrapLayerOutput(OutputStream jos) throws Exception {
OutputStream fixedOS = fixIsNeeded() ? new GZIPFixedOutputStream(jos) : jos;
return new GZIPOutputStream(fixedOS);
}

}

/**
* Prior to Java 16, the java.util.zip.GZIPOutputStream wrote a value of zero for
* the OS field in the header (byte index 9).
* In Java 16, this was changed to a
* value of 255 to better abide by the GZIP specification. Unfortunately, this
* means unparsed data using a GZIP layer might have a single byte difference,
* depending on the Java version used. This can lead to inconsistent behavior of
* test failures that expect a certain byte value.
* <p>
* To resolve this issue, we create this GZIPFixedOutputStream. This should wrap
* the underlying OutputStream and be passed as the OutputStream to the
* GZIPOutputStream. When the GZIPOutputStream writes the 9th byte to this
* GZIPFixedOutputStream, this will always write a value of 255, making all Java
* versions prior to 16 consistent with Java 16+ behavior.
*/
class GZIPFixedOutputStream extends OutputStream {

private final OutputStream os;

public GZIPFixedOutputStream(OutputStream os) {
this.os = os;
}

/**
* The next byte position that byte will be written to. If this is negative,
* that means we have already fixed the output and everything should just
* pass straight through.
* Provides the layer's parameter variables to the layer.
*
* <p>The compression level controls the trade-off between encoding speed
* and output size when gzip-compressing data during unparsing. Higher
* levels produce smaller output but take longer to encode.
*
* @param compressionLevel an integer specifying the gzip compression level
* to use when unparsing. Valid values are 0 through
* 9, where 0 means no compression and 9 means maximum
* compression, or {@link Deflater#DEFAULT_COMPRESSION}
* ({@code -1}) to use the underlying zlib library's
* default (level 6).
*/
private int bytePosition = 0;

@Override
public void close() throws IOException {
os.close();
public void setLayerVariableParameters(int compressionLevel) {
boolean isValidRange = (compressionLevel >= 0 && compressionLevel <= 9)
|| compressionLevel == Deflater.DEFAULT_COMPRESSION;
if (!isValidRange)
runtimeSchemaDefinitionError("Invalid compression level: " + compressionLevel);

this.compressionLevel = compressionLevel;
}

@Override
public void flush() throws IOException {
os.flush();
public InputStream wrapLayerInput(InputStream jis) throws Exception {
return new GZIPInputStream(jis);
}

@Override
public void write(byte[] b, int off, int len) throws IOException {
if (bytePosition < 0) {
// The bad byte has been fixed, pass all writes directly through to the
// underlying OutputStream. This may be more efficient than the default
// OutputStream write() function, which writes the bytes from this array
// one at a time
os.write(b, off, len);
} else {
// The bad byte has not been fixed yet. Unless a newer version of Java
// has made changes, the GZIPOutputStreamm will have passed in a 10 byte
// array to this function that includes the bad byte. Let's just write
// that array using the default write(array) method that writes these
// bytes one at a time and will call the write(int) method that will fix
// that byte. Calling write() one at a time is maybe inefficient but for
// such a small array it should not have a noticeable effect.
super.write(b, off, len);
}
public OutputStream wrapLayerOutput(OutputStream jos) throws Exception {
return new ConfigurableGZIPOutputStream(jos, compressionLevel);
}

@Override
public void write(int b) throws IOException {
if (bytePosition < 0) {
// The bad byte has already been fixed, simply pass this byte through to
// the underlying OutputStream
os.write(b);
} else if (bytePosition < 9) {
// The bad byte has not been fixed, and we haven't reached it yet, simply
// pass this byte through and increment our byte position
os.write(b);
bytePosition += 1;
} else if (bytePosition == 9) {
// This is the bad byte, it is a 0 on some Java versions. Write 255
// instead of to match Java 16+ behavior. Also, set bytePosition to -1 to
// signify that we have fixed the bad byte and that all other writes
// should just pass directly to the underlying OutputStream
os.write(255);
bytePosition = -1;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
<annotation>
<appinfo source="http://www.ogf.org/dfdl/">

<!-- the gzip layer has no parameter variables
nor return result variables. -->
<!-- gzip layer variable-->
<dfdl:defineVariable name="compressionLevel" type="xs:int" defaultValue="-1" external="true"/>

</appinfo>
</annotation>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package org.apache.daffodil.core.layers

import java.io.ByteArrayOutputStream
import java.nio.charset.StandardCharsets
import java.util.zip.Deflater
import scala.xml.Elem

import org.apache.daffodil.core.util.FuzzOneBits
Expand All @@ -27,6 +28,7 @@ import org.apache.daffodil.core.util.FuzzRandomSingleByte
import org.apache.daffodil.core.util.FuzzZeroBits
import org.apache.daffodil.core.util.LayerParseTester
import org.apache.daffodil.core.util.TestUtils
import org.apache.daffodil.layers.runtime1.ConfigurableGZIPOutputStream
import org.apache.daffodil.lib.util.*
import org.apache.daffodil.lib.xml.XMLUtils

Expand Down Expand Up @@ -85,14 +87,15 @@ a few lines of pointless text like this.""".replace("\r\n", "\n").replace("\n",

def makeGZIPData(text: String) = {
val baos = new ByteArrayOutputStream()
val gzos = new java.util.zip.GZIPOutputStream(baos)
// Level 9 happens to produce identical deflate output on both stock zlib and
// zlib-ng for this particular test data, letting the hardcoded baselines below
// match regardless of which implementation the JVM uses. This is empirical and
// could break with different inputs or future zlib versions.
val gzos = new ConfigurableGZIPOutputStream(baos, Deflater.BEST_COMPRESSION)

IOUtils.write(text, gzos, StandardCharsets.UTF_8)
gzos.close()
val data = baos.toByteArray()
// Java 16+ sets the 9th byte to 0xFF, but previous Java versions set the
// value to 0x00. Daffodil always unparses with 0xFF regardless of Java
// version, so force the gzip data to 0xFF to make sure tests round trip
data(9) = 0xff.toByte
val data = baos.toByteArray
data
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,7 @@ class TestLayers {
val gzos = new java.util.zip.GZIPOutputStream(baos)
IOUtils.write(text, gzos, StandardCharsets.UTF_8)
gzos.close()
val data = baos.toByteArray()
// Java 16+ sets the 9th byte to 0xFF, but previous Java versions set the
// value to 0x00. Daffodil always unparses with 0xFF regardless of Java
// version, so force the gzip data to 0xFF to make sure tests round trip
data(9) = 0xff.toByte
val data = baos.toByteArray
data
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,43 +17,14 @@

package org.apache.daffodil.layers.runtime1

import java.io.ByteArrayOutputStream
import java.util.zip.GZIPOutputStream

import org.apache.commons.io.IOUtils
import org.junit.Assert.*
import org.junit.Test

class TestGzipLayer {

val gl = new GZipLayer // default constructor has to work.

@Test def testConstruction() = {
@Test def testConstruction(): Unit = {
assertNotNull(gl)
}

/**
* Tests that our gzip fixer fixes up byte 9 of the gzip header
* so that it has 255 in it for java versions prior to Java 16
*/
@Test def testByte9() = {
val baos = new ByteArrayOutputStream()
val baosRaw = new ByteArrayOutputStream()
val gzos = gl.wrapLayerOutput(baos)
val gzosRaw = new GZIPOutputStream(baosRaw)
val data = "testing 1, 2, 3"
IOUtils.write(data, gzos, "ascii")
IOUtils.write(data, gzosRaw, "ascii")
gzos.close()
gzosRaw.close()
val byte9 = baos.toByteArray.apply(9)
val byte9Raw = baosRaw.toByteArray.apply(9)
if (GZipLayer.fixIsNeeded()) {
assertEquals(0, byte9Raw.toInt & 0xff)
assertEquals(255, byte9.toInt & 0xff)
} else {
assertEquals(255, byte9Raw.toInt & 0xff)
assertEquals(255, byte9.toInt & 0xff)
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,17 @@
<element name="contents">
<complexType>
<sequence dfdlx:layer="gz:gzip">
<annotation>
<documentation>
Level 9 (slowest) is used here only to improve the likelihood of
deterministic gzip output across different JVMs and zlib implementations
during testing. Real-world users should either omit this newVariableInstance
to use the default level (6) or choose a level appropriate for their needs.
</documentation>
<appinfo source="http://www.ogf.org/dfdl/">
<dfdl:newVariableInstance ref="gz:compressionLevel" defaultValue="9"/>
Comment thread
gdesrosiers1805 marked this conversation as resolved.
</appinfo>
</annotation>
<group ref="ex:compressedGroupContents"/>
</sequence>
</complexType>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,17 @@
now the gzipped layered sequence itself
-->
<sequence dfdlx:layer="gz:gzip">
<annotation>
<documentation>
Level 9 (slowest) is used here only to improve the likelihood of
deterministic gzip output across different JVMs and zlib implementations
during testing. Real-world users should either omit this newVariableInstance
to use the default level (6) or choose a level appropriate for their needs.
</documentation>
<appinfo source="http://www.ogf.org/dfdl/">
<dfdl:newVariableInstance ref="gz:compressionLevel" defaultValue="9"/>
</appinfo>
</annotation>
<!--
finally, inside that, we have the original fileTypeGroup group reference.
-->
Expand Down
Loading