Zippy for managing container type files in repo.

This commit is contained in:
Chris Giacofei 2023-02-03 08:58:33 -05:00
parent cfdf09a93e
commit b4237ffbae

198
util/zippey.py Normal file
View File

@ -0,0 +1,198 @@
#!/usr/bin/env python
# Copyright (c) 2014, Sippey Fun Lab
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the name of the Sippey Fun Lab nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDER BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Zippey: A Git filter for friendly handling of ZIP-based files
#
# There are many types of ZIP-based files, such as Microsoft Office .docx,
# .xlsx, .pptx files, OpenOffice .odt files and jar files, that contain
# plaintext content but are not really tractable by Git, because compression
# smears together the parts that have been modified and the parts that remain
# the same across commits. This prevents Git from versioning these files
# meaningfully: it treats them as a new binary blob every time the file is saved.
#
# Zippey is a Git filter that unzips a ZIP-based file into a simple text format
# during git add/commit ("clean" process) and recovers the original ZIP-based
# file after git checkout ("smudge" process). Since the diff is taken on the
# "cleaned" file after the file is added, real changes to the file are likely
# to be reflected by the ordinary git diff command.
#
# The text format is defined as a series of records. Each record represents a
# file in the original ZIP-based file and is composed of two parts:
# a header that contains metadata and a body that contains data. The header
# is a few data fields separated by pipe characters, like this:
#
# length|raw_length|type|filename
#
# where length is an ASCII-coded integer giving the size of the following data
# section, raw_length is the original length of the data (if a transformation
# is applied), type is A for text data or B for binary data, and filename is
# the original file name, including the path if the ZIP-based file contains
# directories. Immediately after the header there is a line feed ('\n'),
# followed by "length" bytes of data, then another line feed, and then the
# next record, i.e.,
#
# [header1]\n[data1]\n[header2]\n[data2] ...
#
# There are two types of data section. If the file contains only text data,
# its content is copied to the data section without any change; otherwise, the
# data is base64-encoded to ensure the entire file is in text format.
#
#
# Author: Sippey (sippey@gmail.com)
# Date: Apr.18, 2014
#
# Modified by Kristian Hoey Horsberg <khh1990 ' at ' gmail.com>
# to make python 3 compatible
# Date May 20th 2014
#
import zipfile
import sys
import io
import base64
import string
import tempfile
import os.path
# When True, debug() writes diagnostic messages to stderr.
DEBUG_ZIPPEY = False
# Program name: prefix of debug/error messages and first line of the help text.
NAME = 'Zippey'
# Encoding used for record header lines and for the "is this text?" decode test.
ENCODING = 'UTF-8'
def debug(msg):
    '''Write msg to stderr as a debug line; no-op unless DEBUG_ZIPPEY is set.'''
    if not DEBUG_ZIPPEY:
        return
    line = '{0}: debug: {1}\n'.format(NAME, msg)
    sys.stderr.write(line)
def error(msg):
    '''Write msg to stderr as an error line prefixed with the program name.'''
    line = '{0}: error: {1}\n'.format(NAME, msg)
    sys.stderr.write(line)
def init():
    '''Prepare the standard streams before any data is read or written.

    Always logs the platform name. On Windows platforms
    (sys.platform starting with 'win') stdin and stdout are additionally
    switched to binary mode (os.O_BINARY) through msvcrt.setmode; on every
    other platform nothing else happens.
    '''
    debug("Running on {}".format(sys.platform))
    if not sys.platform.startswith('win'):
        return

    # msvcrt only exists on Windows, hence the function-local import.
    import msvcrt
    debug("Enable Windows binary workaround")
    for stream in (sys.stdin, sys.stdout):
        msvcrt.setmode(stream.fileno(), os.O_BINARY)
# Extensions (lower case, with the leading dot) whose members are stored as
# text whenever they decode cleanly, even if they contain non-ASCII characters.
_TEXT_EXTENSIONS = frozenset(('.txt', '.html', '.xml'))

# Byte values of string.printable, for a constant-time per-byte check.
_PRINTABLE_BYTES = frozenset(bytearray(string.printable.encode('ascii')))


def _is_text_member(name, data):
    '''Return True if the ZIP member can be stored verbatim as a text record.

    A member is text when its bytes decode with ENCODING and either its
    extension is in _TEXT_EXTENSIONS or every byte is a printable ASCII
    character.
    '''
    try:
        data.decode(ENCODING)
    except UnicodeDecodeError:
        return False
    # Keep the leading dot so the value is comparable with _TEXT_EXTENSIONS
    # (stripping it made the whitelist impossible to match).
    extension = os.path.splitext(name)[1].strip().lower()
    if extension in _TEXT_EXTENSIONS:
        return True
    return all(byte in _PRINTABLE_BYTES for byte in bytearray(data))


def encode(input, output):
    '''Encode a ZIP archive into the VCS-friendly record format.

    The whole archive is read from ``input`` and one record per member is
    written to ``output``::

        <length>|<raw_length>|<A or B>|<member name>\\n<data>\\n

    Text members (type A) are copied unchanged; all other members (type B)
    are base64-encoded, with raw_length holding the size before encoding.

    Args:
        input: binary file object positioned at the start of a ZIP archive.
        output: binary file object that receives the encoded records.

    Raises:
        zipfile.BadZipfile: if ``input`` does not contain a valid ZIP archive
            (raised by the zipfile module).
    '''
    debug("ENCODE was called")
    # zipfile needs a seekable file, which stdin is not, so spool to a
    # temporary file first. The context managers release both handles even
    # when reading the archive fails.
    with tempfile.TemporaryFile(mode='w+b') as tfp:
        tfp.write(input.read())
        with zipfile.ZipFile(tfp, "r") as zfp:
            for name in zfp.namelist():
                data = zfp.read(name)
                raw_len = len(data)
                if _is_text_member(name, data):
                    debug("Appending text file '{}'".format(name))
                    kind = 'A'
                else:
                    debug("Appending binary file '{}'".format(name))
                    kind = 'B'
                    data = base64.b64encode(data)
                header = "{}|{}|{}|{}\n".format(len(data), raw_len, kind, name)
                output.write(header.encode(ENCODING))
                output.write(data)
                output.write(b"\n")  # Separation from next meta line
def decode(input, output):
    '''Rebuild a ZIP archive from the VCS-friendly record format.

    Records of the form ``length|raw_length|type|filename\\n<data>\\n`` are
    read from ``input`` until end of file. Type A data is stored as is,
    type B data is base64-decoded, and the resulting deflate-compressed
    archive is written to ``output`` once every record has been processed.

    Args:
        input: binary file object containing the encoded records.
        output: binary file object that receives the ZIP archive.

    Raises:
        SystemExit: with status 1, after reporting through error(), when a
            header is malformed, the record type is neither A nor B, or the
            stream ends before ``length`` bytes of data were read.
    '''
    debug("DECODE was called")
    with tempfile.TemporaryFile(mode='w+b') as tfp:
        with zipfile.ZipFile(tfp, "w", zipfile.ZIP_DEFLATED) as zfp:
            while True:
                meta = input.readline().decode(ENCODING)
                if not meta:
                    break
                try:
                    # maxsplit=3 keeps '|' characters inside the file name.
                    data_len, raw_len, mode, name = meta.split('|', 3)
                    data_len = int(data_len)
                    int(raw_len)  # validated only; the value is not needed
                except ValueError:
                    error('Malformed header "{}"'.format(meta.rstrip('\r\n')))
                    sys.exit(1)
                # Drop only the line terminator so that file names ending in
                # whitespace survive the round trip.
                name = name.rstrip('\n')
                if mode not in ('A', 'B'):
                    # Should never reach here
                    error('Illegal mode "{}"'.format(mode))
                    sys.exit(1)
                payload = input.read(data_len)
                if len(payload) != data_len:
                    error("Truncated data for '{}'".format(name))
                    sys.exit(1)
                input.read(1)  # Skip last '\n'
                if mode == 'A':
                    debug("Appending text file '{}'".format(name))
                else:
                    debug("Appending binary file '{}'".format(name))
                    payload = base64.b64decode(payload)
                zfp.writestr(name, payload)
        # Leaving the inner block closed the archive, which flushes the
        # central directory into the temporary file; now copy it out.
        tfp.seek(0)
        output.write(tfp.read())
def main():
    '''Command-line entry point: dispatch on the first argument.

    ``e`` encodes stdin to stdout, ``d`` decodes stdin to stdout. With no
    argument, ``-`` or ``--help``, the usage text is printed. Any other
    argument is reported as an error and the process exits with status 1.
    '''
    init()
    if len(sys.argv) < 2 or sys.argv[1] in ('-', '--help'):
        sys.stdout.write("{}\nTo encode: 'python zippey.py e'\nTo decode: 'python zippey.py d'\nAll files read from stdin and printed to stdout\n".format(NAME))
        return

    command = sys.argv[1]
    if command == 'e':
        handler = encode
    elif command == 'd':
        handler = decode
    else:
        error("Illegal argument '{}'. Try --help for more information".format(command))
        sys.exit(1)

    # closefd=False: the wrappers must not close descriptors 0 and 1, which
    # sys.stdin and sys.stdout still own. Leaving the with-block flushes the
    # binary output explicitly instead of relying on garbage collection.
    with io.open(sys.stdin.fileno(), 'rb', closefd=False) as source, \
            io.open(sys.stdout.fileno(), 'wb', closefd=False) as sink:
        handler(source, sink)
# Script entry point: run the filter only when this file is executed directly.
if __name__ == "__main__":
    main()