-
-
Save ForeverZer0/a2cd292bd2f3b5e114956c00bb6e872b to your computer and use it in GitHub Desktop.
| using System; | |
| using System.IO; | |
| using System.IO.Compression; | |
| using System.Text; | |
| namespace TarExample | |
| { | |
/// <summary>
/// Minimal, dependency-free extractor for <c>.tar</c> and <c>.tar.gz</c> archives.
/// </summary>
/// <remarks>
/// Regular files, directories, the POSIX <c>ustar</c> name prefix and GNU long-name
/// (<c>'L'</c>) entries are handled. Every other entry type (links, devices, pax
/// extended headers, ...) is skipped. Header checksums are not verified.
/// </remarks>
public class Tar
{
    // Layout of the 512-byte tar header; only the fields needed for extraction.
    private const int BlockSize = 512;
    private const int NameOffset = 0;
    private const int NameLength = 100;
    private const int SizeOffset = 124;
    private const int SizeLength = 12;
    private const int TypeFlagOffset = 156;
    private const int MagicOffset = 257;
    private const int MagicLength = 6;
    private const int PrefixOffset = 345;
    private const int PrefixLength = 155;

    // Size of the scratch buffer used to stream file contents to disk.
    private const int CopyBufferSize = 81920;

    // Upper bound for a GNU long-name payload, to avoid absurd allocations on corrupt input.
    private const long MaxLongNameLength = 65536;

    /// <summary>
    /// Extracts a <i>.tar.gz</i> archive to the specified directory.
    /// </summary>
    /// <param name="filename">The <i>.tar.gz</i> to decompress and extract.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    public static void ExtractTarGz(string filename, string outputDir)
    {
        using (var stream = File.OpenRead(filename))
        {
            ExtractTarGz(stream, outputDir);
        }
    }

    /// <summary>
    /// Extracts a <i>.tar.gz</i> archive stream to the specified directory.
    /// </summary>
    /// <param name="stream">The <i>.tar.gz</i> to decompress and extract. It is disposed when extraction ends.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    public static void ExtractTarGz(Stream stream, string outputDir)
    {
        if (stream == null)
        {
            throw new ArgumentNullException(nameof(stream));
        }

        // The tar reader only reads forward, so the decompressed data can be consumed
        // directly instead of being buffered in memory first.
        using (var gzip = new GZipStream(stream, CompressionMode.Decompress))
        {
            ExtractTar(gzip, outputDir);
        }
    }

    /// <summary>
    /// Extracts a <c>tar</c> archive to the specified directory.
    /// </summary>
    /// <param name="filename">The <i>.tar</i> to extract.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    public static void ExtractTar(string filename, string outputDir)
    {
        using (var stream = File.OpenRead(filename))
        {
            ExtractTar(stream, outputDir);
        }
    }

    /// <summary>
    /// Extracts a <c>tar</c> archive to the specified directory.
    /// </summary>
    /// <param name="stream">The <i>.tar</i> to extract. It only needs to be readable, not seekable.</param>
    /// <param name="outputDir">Output directory to write the files. An empty string means the current directory.</param>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="outputDir"/> is null.</exception>
    /// <exception cref="InvalidDataException">A header is malformed or an entry would be written outside <paramref name="outputDir"/>.</exception>
    /// <exception cref="EndOfStreamException">The archive ends in the middle of an entry's data.</exception>
    public static void ExtractTar(Stream stream, string outputDir)
    {
        if (stream == null)
        {
            throw new ArgumentNullException(nameof(stream));
        }

        if (outputDir == null)
        {
            throw new ArgumentNullException(nameof(outputDir));
        }

        var root = Path.GetFullPath(outputDir.Length == 0 ? "." : outputDir);
        var header = new byte[BlockSize];
        var scratch = new byte[CopyBufferSize];
        string pendingLongName = null;

        // A short read here means end of stream: stop instead of re-parsing stale bytes.
        while (ReadFully(stream, header, BlockSize) == BlockSize)
        {
            var name = ReadString(header, NameOffset, NameLength);

            // The archive is terminated by zero-filled blocks, which yield an empty name.
            if (String.IsNullOrWhiteSpace(name))
            {
                break;
            }

            var size = ReadOctal(header, SizeOffset, SizeLength);
            var typeFlag = (char)header[TypeFlagOffset];

            // GNU long name: the payload is the full path of the NEXT entry.
            if (typeFlag == 'L')
            {
                pendingLongName = ReadLongName(stream, size);
                Skip(stream, Padding(size), scratch);
                continue;
            }

            if (!String.IsNullOrEmpty(pendingLongName))
            {
                name = pendingLongName;
            }
            else if (ReadString(header, MagicOffset, MagicLength) == "ustar")
            {
                // POSIX ustar splits long paths into "prefix" + "/" + "name".
                var prefix = ReadString(header, PrefixOffset, PrefixLength);
                if (prefix.Length > 0)
                {
                    name = prefix + "/" + name;
                }
            }

            pendingLongName = null;

            // Old archives mark directories only with a trailing slash.
            var isDirectory = typeFlag == '5' || name.EndsWith("/", StringComparison.Ordinal);
            var isFile = typeFlag == '0' || typeFlag == '\0' || typeFlag == '7';

            if (isDirectory)
            {
                Directory.CreateDirectory(ResolvePath(root, name));
                Skip(stream, size + Padding(size), scratch);
            }
            else if (isFile)
            {
                var target = ResolvePath(root, name);
                var parent = Path.GetDirectoryName(target);
                if (!String.IsNullOrEmpty(parent))
                {
                    Directory.CreateDirectory(parent);
                }

                // FileMode.Create truncates, so a shorter entry fully replaces an existing file.
                using (var file = File.Open(target, FileMode.Create, FileAccess.Write))
                {
                    CopyExactly(stream, file, size, scratch);
                }

                Skip(stream, Padding(size), scratch);
            }
            else
            {
                // Unsupported entry type: step over its data so the next header stays aligned.
                Skip(stream, size + Padding(size), scratch);
            }
        }
    }

    /// <summary>Returns the number of filler bytes that follow <paramref name="size"/> bytes of entry data.</summary>
    private static long Padding(long size)
    {
        return (BlockSize - (size % BlockSize)) % BlockSize;
    }

    /// <summary>Reads until <paramref name="count"/> bytes are buffered or the stream ends; returns the bytes read.</summary>
    private static int ReadFully(Stream stream, byte[] buffer, int count)
    {
        var total = 0;
        while (total < count)
        {
            var read = stream.Read(buffer, total, count - total);
            if (read <= 0)
            {
                break;
            }

            total += read;
        }

        return total;
    }

    /// <summary>Decodes a NUL-terminated header field, ignoring any bytes after the terminator.</summary>
    private static string ReadString(byte[] header, int offset, int length)
    {
        var end = Array.IndexOf(header, (byte)0, offset, length);
        var count = end < 0 ? length : end - offset;
        return Encoding.UTF8.GetString(header, offset, count);
    }

    /// <summary>Parses an octal numeric header field; an empty field is treated as zero.</summary>
    private static long ReadOctal(byte[] header, int offset, int length)
    {
        var text = ReadString(header, offset, length).Trim();
        long value = 0;
        foreach (var digit in text)
        {
            if (digit < '0' || digit > '7')
            {
                throw new InvalidDataException("Invalid octal field in tar header: '" + text + "'.");
            }

            value = (value << 3) | (long)(digit - '0');
        }

        return value;
    }

    /// <summary>Reads the payload of a GNU long-name entry and returns the path it contains.</summary>
    private static string ReadLongName(Stream stream, long size)
    {
        if (size > MaxLongNameLength)
        {
            throw new InvalidDataException("Long-name entry in tar archive is too large.");
        }

        var bytes = new byte[(int)size];
        if (ReadFully(stream, bytes, bytes.Length) != bytes.Length)
        {
            throw new EndOfStreamException("Unexpected end of tar archive inside a long-name entry.");
        }

        var end = Array.IndexOf(bytes, (byte)0);
        return Encoding.UTF8.GetString(bytes, 0, end < 0 ? bytes.Length : end);
    }

    /// <summary>Maps an entry name to a full path and rejects names that escape <paramref name="root"/>.</summary>
    private static string ResolvePath(string root, string name)
    {
        var separators = new[] { Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar };
        var target = Path.GetFullPath(Path.Combine(root, name));
        var trimmedRoot = root.TrimEnd(separators);
        var isRoot = target.TrimEnd(separators).Equals(trimmedRoot, StringComparison.Ordinal);
        var isInside = target.StartsWith(trimmedRoot + Path.DirectorySeparatorChar, StringComparison.Ordinal);

        if (!isRoot && !isInside)
        {
            throw new InvalidDataException("Tar entry '" + name + "' would be extracted outside the output directory.");
        }

        return target;
    }

    /// <summary>Copies exactly <paramref name="count"/> bytes, failing if the source ends early.</summary>
    private static void CopyExactly(Stream source, Stream destination, long count, byte[] scratch)
    {
        var remaining = count;
        while (remaining > 0)
        {
            var wanted = (int)Math.Min(scratch.Length, remaining);
            var read = source.Read(scratch, 0, wanted);
            if (read <= 0)
            {
                throw new EndOfStreamException("Unexpected end of tar archive inside file data.");
            }

            destination.Write(scratch, 0, read);
            remaining -= read;
        }
    }

    /// <summary>Discards up to <paramref name="count"/> bytes by reading them; stops quietly at end of stream.</summary>
    private static void Skip(Stream source, long count, byte[] scratch)
    {
        var remaining = count;
        while (remaining > 0)
        {
            var wanted = (int)Math.Min(scratch.Length, remaining);
            var read = source.Read(scratch, 0, wanted);
            if (read <= 0)
            {
                return;
            }

            remaining -= read;
        }
    }
}
| } | |
| /* | |
| This software is available under 2 licenses -- choose whichever you prefer. | |
| ------------------------------------------------------------------------------ | |
| ALTERNATIVE A - MIT License | |
| Copyright (c) 2017 Sean Barrett | |
| Permission is hereby granted, free of charge, to any person obtaining a copy of | |
| this software and associated documentation files (the "Software"), to deal in | |
| the Software without restriction, including without limitation the rights to | |
| use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
| of the Software, and to permit persons to whom the Software is furnished to do | |
| so, subject to the following conditions: | |
| The above copyright notice and this permission notice shall be included in all | |
| copies or substantial portions of the Software. | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| SOFTWARE. | |
| ------------------------------------------------------------------------------ | |
| ALTERNATIVE B - Public Domain (www.unlicense.org) | |
| This is free and unencumbered software released into the public domain. | |
| Anyone is free to copy, modify, publish, use, compile, sell, or distribute this | |
| software, either in source code form or as a compiled binary, for any purpose, | |
| commercial or non-commercial, and by any means. | |
| In jurisdictions that recognize copyright laws, the author or authors of this | |
| software dedicate any and all copyright interest in the software to the public | |
| domain. We make this dedication for the benefit of the public at large and to | |
| the detriment of our heirs and successors. We intend this dedication to be an | |
| overt act of relinquishment in perpetuity of all present and future rights to | |
| this software under copyright law. | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
| ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | |
| WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
| */ |
Do you also know how to compress to a tar.gz? I've been trying to reverse your process to do so but have had no luck.
@Mpprobst
This is a very quick and dirty method of extraction, simply using offsets defined by the spec to grab the few pieces of information needed to extract the file's data from the stream. A full and proper implementation would define some structs, checksums, etc.
The TAR spec is actually fairly simple: it essentially just copies all of the input into a single stream, prepending each input with a basic header that it ensures begins on a specific byte boundary in the stream. In order for any other tool to be able to read the output, you would need to do a proper implementation and ensure the entire header is valid.
Luckily, the compression aspect of it using a GZip stream is already built-in to .NET, so it won't require anything complicated to convert a tar archive into a tar.gz one.
getting exception in line 40 when trying to decompress large files of 100MB
40 memStr.Write(buffer, 0, read);
issue fixed https://gist.github.com/Su-s/438be493ae692318c73e30367cbc5c2a
ExtractTar can use memory blocks for reading and writing. My tests showed a 45% performance improvement.
const int chunk = 2 * 1024 * 1024; //2MB var fbuf = new byte[chunk];
using (var str = File.Open(output, FileMode.OpenOrCreate, FileAccess.Write)) { int fbalance = size; int fread, fcount; while (true) { fcount = (fbuf.Length <= fbalance) ? fbuf.Length : fbalance; fread = stream.Read(fbuf, 0, fcount); if (fread <= 0) break; str.Write(fbuf, 0, fread); fbalance -= fread; } }
Excellent, just what I was looking for. I have wrapped this into a PowerShell function:
https://github.com/TheDotSource/Expand-TarBall/blob/main/Expand-TarBall.ps1
If you are using .NET 6 you should replace:
do
{
read = gzip.Read(buffer, 0, chunk);
memStr.Write(buffer, 0, read);
} while (read == chunk);
with the following code:
while ((read = gzip.Read(buffer, 0, buffer.Length)) > 0)
{
memStr.Write(buffer, 0, read);
}
This small code change avoids writing a memory stream with only a few bytes in it, and an endless loop in the ExtractTar method.
Thanks for the initial snippet; it really works great without using any dependencies.
What license would you put this code under?
What license would you put this code under?
@voltagex I added a choice of MIT or Public Domain, whichever you prefer. If neither of them suits your needs, I would have no problem with any exceptions. Feel free to use/modify/sell the code as you see fit in any open/closed source project, commercial or otherwise; I don't require any credit.
If you are using .NET 6 you should replace:
do { read = gzip.Read(buffer, 0, chunk); memStr.Write(buffer, 0, read); } while (read == chunk);
with the following code:
while ((read = gzip.Read(buffer, 0, buffer.Length)) > 0) { memStr.Write(buffer, 0, read); }
This small code change avoids writing a memory stream with only a few bytes in it, and an endless loop in the ExtractTar method. Thanks for the initial snippet; it really works great without using any dependencies.
Thanks for this comment! This resolved my issue which proved very hard to debug.
I forked the snippet and made some style refactors and "modernizations" along with the fixes mentioned in the comments. This version works great on .NET 6.
need this class i.e. GZipStream.cs which they used inside this file
need this class i.e. GZipStream.cs which they used inside this file
It's System.IO.Compression.GZipStream. If you don't have it, you should update your .net version.
System.IO.Compression.GZipStream It is one of the features that is introduced in .NET 7 https://devblogs.microsoft.com/dotnet/announcing-dotnet-7/ The above example should work with .NET 6 if you replace this: https://gist.github.com/ForeverZer0/a2cd292bd2f3b5e114956c00bb6e872b?permalink_comment_id=4071264#gistcomment-4071264 and with older versions (.NET Core 3) without that replacement
I found an edge case where this code crashes. We encountered a case where the name in the 512 byte header of a file was longer than 100 characters (bytes). These cases were handled for us by the use of a "fake file" called ././@LongLink with a typeTag of 'L'. This fake file's data contains the full filename that is larger than 100 bytes. So we need to read this first and temporarily hold onto it as we are about to read the ACTUAL real file in the next iteration. The real file's name is truncated to 100 characters but we will just skip those and use the full name we just read last iteration instead. This solved our problems. I updated my own forked gist with the fix here: https://gist.github.com/Matheos96/da8990030dfe3e27b0a48722042d9c0b
Thanks for the code!
I had trouble with the files here and it turns out there's an extra string in the file names:
Hex View 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
00000000 70 79 74 68 6F 6E 2F 44 4C 4C 73 2F 5F 61 73 79 python/DLLs/_asy
00000010 6E 63 69 6F 2E 70 79 64 00 63 69 6F 2E 70 79 64 ncio.pyd.cio.pyd
00000020 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000030 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
I'm not familiar with the tar spec but it seems coherent to terminate the string at the first null (to replace line 71, I also took the opportunity to use UTF8 instead of ASCII):
int termIndex = Array.IndexOf(buffer, (byte)'\0');
var name = Encoding.UTF8.GetString(buffer, 0, termIndex >= 0 ? termIndex : buffer.Length);
There are other files in the tar.gz I linked that cause problems, and it seems to be because a file prefix is missing (0x0208F359-0x0208F3F3):
Hex View 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
0208F200 6D 65 74 61 64 61 74 61 5F 65 64 69 74 61 62 6C metadata_editabl
0208F210 65 2E 63 70 79 74 68 6F 6E 2D 33 31 33 2E 70 79 e.cpython-313.py
0208F220 63 00 69 70 2F 5F 69 6E 74 65 72 6E 61 6C 2F 6F c.ip/_internal/o
0208F230 70 65 72 61 74 69 6F 6E 73 2F 62 75 69 6C 64 2F perations/build/
0208F240 5F 5F 70 79 63 61 63 68 65 5F 5F 2F 6D 65 74 61 __pycache__/meta
0208F250 64 61 74 61 5F 65 64 69 74 61 62 6C 65 2E 63 70 data_editable.cp
0208F260 79 74 68 6F 30 30 30 30 36 36 36 00 30 30 30 30 ytho0000666.0000
0208F270 30 30 30 00 30 30 30 30 30 30 30 00 30 30 30 30 000.0000000.0000
0208F280 30 30 30 33 36 30 37 00 31 34 35 34 34 34 30 30 0003607.14544400
0208F290 32 30 30 00 30 30 34 37 35 37 35 00 30 00 00 00 200.0047575.0...
0208F2A0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F2B0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F2C0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F2D0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F2E0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F2F0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F300 00 75 73 74 61 72 00 30 30 72 6F 6F 74 00 00 00 .ustar.00root...
0208F310 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F320 00 00 00 00 00 00 00 00 00 72 6F 6F 74 00 00 00 .........root...
0208F330 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F340 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F350 00 00 00 00 00 00 00 00 00 70 79 74 68 6F 6E 2F .........python/
0208F360 4C 69 62 2F 73 69 74 65 2D 70 61 63 6B 61 67 65 Lib/site-package
0208F370 73 2F 70 69 70 2F 5F 69 6E 74 65 72 6E 61 6C 2F s/pip/_internal/
0208F380 6F 70 65 72 61 74 69 6F 6E 73 2F 62 75 69 6C 64 operations/build
0208F390 2F 5F 5F 70 79 63 61 63 68 65 5F 5F 00 00 00 00 /__pycache__....
0208F3A0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F3B0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F3C0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F3D0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F3E0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
0208F3F0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
Here's my corrected version (uses @Matheos96's LongLink implementation but for net48):
https://gist.github.com/Unprex/7815aac5b3807cb08de0fcabbbde3613
See lines 99-104 for the second fix.


Semicolon missing on end of line 76