Skip to content

Commit 59e421f

Browse files
committed
Merge branch 'feature/file_chunk_starts_with_BOM' into develop
Resolves #64
2 parents ee1facd + 4a88939 commit 59e421f

3 files changed

Lines changed: 160 additions & 8 deletions

File tree

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
using System.Collections.Generic;
2+
using System.IO;
3+
using System.Text;
4+
using Xunit;
5+
6+
namespace HttpMultipartParser.UnitTests.ParserScenarios
7+
{
8+
// This test attempts to reproduce the bug discussed here:
9+
// https://github.com/Http-Multipart-Data-Parser/Http-Multipart-Data-Parser/issues/64
10+
public class FileChunkStartsWithBOM
11+
{
12+
private static readonly int _binaryBufferSize = 100;
13+
private static readonly byte[] _utf8BOMBinary = new byte[] { 0xef, 0xbb, 0xbf };
14+
private static readonly string _utf8BOMString = Encoding.UTF8.GetString(_utf8BOMBinary);
15+
16+
private static readonly string _prefix =
17+
@"--boundary
18+
Content-Type: application/octet-stream
19+
20+
";
21+
// This padding ensures that the BOM is positioned at the begining of the second chunk
22+
private static readonly string _padding = new string('+', _binaryBufferSize - _prefix.Length);
23+
private static readonly string _fileContent = $"{_padding}{_utf8BOMString}Hello world";
24+
25+
private static readonly string _testDataFormat =
26+
@"{0}{1}
27+
--boundary--";
28+
private static readonly string _testData = TestUtil.TrimAllLines(string.Format(_testDataFormat, _prefix, _fileContent));
29+
30+
/// <summary>
31+
/// Test case for files with additional parameter.
32+
/// </summary>
33+
private static readonly TestData _testCase = new TestData(
34+
_testData,
35+
new List<ParameterPart> { },
36+
new List<FilePart> {
37+
new FilePart(null, null, TestUtil.StringToStreamNoBom(_fileContent), null, "application/octet-stream", "form-data")
38+
}
39+
);
40+
41+
/// <summary>
42+
/// Initializes the test data before each run, this primarily
43+
/// consists of resetting data stream positions.
44+
/// </summary>
45+
public FileChunkStartsWithBOM()
46+
{
47+
foreach (var filePart in _testCase.ExpectedFileData)
48+
{
49+
filePart.Data.Position = 0;
50+
}
51+
}
52+
53+
[Fact]
54+
public void CanHandleBOM()
55+
{
56+
var encoding = Encoding.UTF8;
57+
58+
using (Stream stream = TestUtil.StringToStream(_testCase.Request, encoding))
59+
{
60+
var parser = MultipartFormDataParser.Parse(stream, encoding, _binaryBufferSize);
61+
62+
using (var file = parser.Files[0].Data)
63+
{
64+
file.Position = 0;
65+
66+
// Read the padding and assert we get the expected value
67+
var paddingBuffer = new byte[_padding.Length];
68+
var readCount = file.Read(paddingBuffer, 0, paddingBuffer.Length);
69+
70+
Assert.Equal(_padding, encoding.GetString(paddingBuffer));
71+
72+
// Read the BOM and assert we get the expected value
73+
var bomBuffer = new byte[_utf8BOMBinary.Length];
74+
readCount = file.Read(bomBuffer, 0, bomBuffer.Length);
75+
76+
// If this assertion fails, it means that we have reproduced the problem described in GH-64
77+
// If it succeeds, it means that the bug has been fixed.
78+
Assert.Equal(_utf8BOMString, encoding.GetString(bomBuffer));
79+
80+
// Read the rest of the content and assert we get the expected value
81+
var restOfContentBuffer = new byte[_fileContent.Length - _padding.Length - _utf8BOMString.Length];
82+
readCount = file.Read(restOfContentBuffer, 0, restOfContentBuffer.Length);
83+
84+
Assert.Equal("Hello world", encoding.GetString(restOfContentBuffer));
85+
}
86+
}
87+
}
88+
}
89+
}

Source/HttpMultipartParser.UnitTests/RebufferableBinaryReaderUnitTests.cs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,48 @@ public void CanReadMixedAsciiAndUTFCharactersOverBuffers()
6767
Assert.Equal('g', reader.Read());
6868
}
6969

70+
[Theory]
71+
[InlineData(new byte[] { 0xef, 0xbb, 0xbf }, "utf-8")] // UTF-8
72+
[InlineData(new byte[] { 0xff, 0xfe }, "utf-16")] // UTF-16
73+
[InlineData(new byte[] { 0xfe, 0xff }, "utf-16BE")] // UTF-16 Big Endian
74+
[InlineData(new byte[] { 0xfe, 0xff }, "utf-32")] // UTF-32
75+
[InlineData(new byte[] { 0x00, 0x00, 0xfe, 0xff }, "utf-32BE")] // UTF-32 Big Endian
76+
public void CanReadBOM(byte[] bom, string encodingName)
77+
{
78+
var encoding = Encoding.GetEncoding(encodingName);
79+
80+
var prefixString = "Foo Bar";
81+
var prefixBinary = encoding.GetBytes(prefixString);
82+
83+
var sufixString = "Hello world";
84+
var sufixBinary = encoding.GetBytes(sufixString);
85+
86+
var stream = new MemoryStream();
87+
var binaryWriter = new BinaryWriter(stream);
88+
binaryWriter.Write(prefixBinary);
89+
binaryWriter.Write(bom);
90+
binaryWriter.Write(sufixBinary);
91+
binaryWriter.Flush();
92+
stream.Position = 0;
93+
94+
var reader = new RebufferableBinaryReader(stream, encoding);
95+
96+
var buffer = new byte[prefixBinary.Length];
97+
reader.Read(buffer, 0, buffer.Length);
98+
Assert.Equal(prefixString, encoding.GetString(buffer));
99+
100+
buffer = new byte[bom.Length];
101+
reader.Read(buffer, 0, buffer.Length);
102+
for (int i = 0; i < buffer.Length; i++)
103+
{
104+
Assert.Equal(bom[i], buffer[i]);
105+
}
106+
107+
buffer = new byte[sufixBinary.Length];
108+
reader.Read(buffer, 0, buffer.Length);
109+
Assert.Equal(sufixString, encoding.GetString(buffer));
110+
}
111+
70112
#endregion
71113

72114
#region ReadAsync() Tests

Source/HttpMultipartParser/RebufferableBinaryReader.cs

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public class RebufferableBinaryReader
6161
/// </summary>
6262
private readonly BinaryStreamStack streamStack;
6363

64+
/// <summary>
65+
/// Counts the number of chunks read from the underlying stream.
66+
/// </summary>
67+
private int processedChunkCounter;
68+
6469
#endregion
6570

6671
#region Constructors and Destructors
@@ -73,7 +78,7 @@ public class RebufferableBinaryReader
7378
/// The input stream to read from.
7479
/// </param>
7580
public RebufferableBinaryReader(Stream input)
76-
: this(input, new UTF8Encoding(false))
81+
: this(input, Encoding.UTF8)
7782
{
7883
}
7984

@@ -95,6 +100,7 @@ public RebufferableBinaryReader(Stream input, Encoding encoding, int bufferSize
95100
streamStack = new BinaryStreamStack(encoding);
96101
this.encoding = encoding;
97102
this.bufferSize = bufferSize;
103+
processedChunkCounter = 0;
98104
}
99105

100106
#endregion
@@ -529,15 +535,30 @@ private async Task<int> StreamDataAsync(CancellationToken cancellationToken = de
529535
/// </param>
530536
private void PushToStack(byte[] buffer, int amountRead)
531537
{
532-
// We need to check if our stream is using our encodings
533-
// BOM, if it is we need to jump it.
534-
int bomOffset = GetBomOffset(buffer);
538+
/*
539+
The logic in this method until August 2025 would eliminate the BOM (also called the encoding preamble)
540+
if it was present at the begining of each and every buffer read from the stream.
535541
536-
// Sometimes we'll get a buffer that's smaller then we expect, chop it down
537-
// for the reader:
538-
if (amountRead - bomOffset > 0)
542+
However, we only need to remove the BOM if present at the very begining of the stream.
543+
In other words: only remove the BOM from the first chunk.
544+
*/
545+
546+
if (amountRead > 0)
539547
{
540-
streamStack.Push(buffer, bomOffset, amountRead - bomOffset);
548+
if (processedChunkCounter == 0)
549+
{
550+
int bomOffset = GetBomOffset(buffer);
551+
if (amountRead - bomOffset > 0)
552+
{
553+
streamStack.Push(buffer, bomOffset, amountRead - bomOffset);
554+
}
555+
}
556+
else
557+
{
558+
streamStack.Push(buffer, 0, amountRead);
559+
}
560+
561+
processedChunkCounter++;
541562
}
542563
}
543564

0 commit comments

Comments
 (0)