#!/usr/bin/env python

# Copyright (c) 2014, Sippey Fun Lab
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
#     * Neither the name of the Sippey Fun Lab nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDER BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Zippey: A Git filter for friendly handling of ZIP-based files
#
# There are many types of ZIP-based files, such as Microsoft Office .docx,
# .xlsx, .pptx files, OpenOffice .odt files and jar files, that contain
# plaintext content but are not really tractable by Git, because compression
# smears together the parts that have been modified and the parts that remain
# the same across commits.
# This prevents Git from versioning these files and makes it treat them as a
# new binary blob every time the file is saved.
#
# Zippey is a Git filter that unzips a ZIP-based file into a simple text format
# during git add/commit ("clean" process) and recovers the original ZIP-based
# file after git checkout ("smudge" process). Since the diff is taken on the
# "cleaned" file after the file is added, real changes to the file are likely
# to be reflected by the ordinary git diff command.
#
# The text format is defined as a series of records. Each record represents a
# file in the original ZIP-based file and is composed of two parts:
# a header that contains metadata and a body that contains data. The header
# is a few data fields separated by the pipe character, like this:
#
#       length|raw_length|type|filename
#
# where length is an ASCII-coded integer giving the size of the following data
# section, raw_length is the original length of the data (if a transformation
# is applied), type can be A for text data or B for binary data, and filename
# is the original file name, including the path if the ZIP-based file contains
# directories. Immediately after the header there is a line feed ('\n'),
# followed by "length" bytes of data, then another line feed, and then the
# next record, i.e.
#
#       [header1]\n[data1]\n[header2]\n[data2] ...
#
# There are two types of data section. If the file contains only text data,
# its content is copied to the data section without any change; otherwise, the
# data is base64 encoded to ensure the entire file is in text format.
#
#
# Author: Sippey (sippey@gmail.com)
# Date: Apr.18, 2014
#
# Modified by Kristian Hoey Horsberg
# to make python 3 compatible
# Date May 20th 2014
#

import zipfile
import sys
import io
import base64
import string
import tempfile
import os
import os.path

DEBUG_ZIPPEY = False
NAME = 'Zippey'
ENCODING = 'UTF-8'

# Extensions (lower case, WITH the leading dot, as os.path.splitext returns
# them) whose content is stored verbatim whenever it is valid ENCODING text.
_TEXT_EXTENSIONS = frozenset(['.txt', '.html', '.xml'])

# string.printable as a byte string, used to test raw archive members.
_PRINTABLE_BYTES = string.printable.encode('ascii')


def debug(msg):
    '''Print debug message to stderr when DEBUG_ZIPPEY is enabled'''
    if DEBUG_ZIPPEY:
        sys.stderr.write('{0}: debug: {1}\n'.format(NAME, msg))


def error(msg):
    '''Print error message to stderr'''
    sys.stderr.write('{0}: error: {1}\n'.format(NAME, msg))


def init():
    '''Initialize writing; set binary mode for windows'''
    debug("Running on {}".format(sys.platform))
    if sys.platform.startswith('win'):
        import msvcrt
        debug("Enable Windows binary workaround")
        # Switch stdin/stdout to binary mode so the byte stream passes
        # through unchanged.
        msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)


def _is_text(name, data):
    '''Return True if archive member `name` with content `data` (bytes)
    should be stored verbatim (type A) rather than base64 encoded (type B).

    The content must decode as ENCODING. In addition, either the file
    extension is one of _TEXT_EXTENSIONS or every byte is in
    string.printable.
    '''
    try:
        data.decode(ENCODING)
    except UnicodeDecodeError:
        return False
    extension = os.path.splitext(name)[1].strip().lower()
    if extension in _TEXT_EXTENSIONS:
        return True
    # Deleting every printable byte leaves an empty string exactly when all
    # bytes were printable; this is a single pass over the data.
    return not data.translate(None, _PRINTABLE_BYTES)


def _write_record(output, mode, name, payload, raw_len):
    '''Write one "length|raw_length|type|filename" record to `output`'''
    header = "{}|{}|{}|{}\n".format(len(payload), raw_len, mode, name)
    output.write(header.encode(ENCODING))
    output.write(payload)
    output.write(b"\n")  # Separation from next meta line


def encode(input, output):
    '''Encode into special VCS friendly format from input to output

    input  -- binary file-like object containing a complete ZIP archive
    output -- binary file-like object receiving one record per member

    Raises zipfile.BadZipfile if the input is not a valid ZIP archive.
    '''
    debug("ENCODE was called")
    # ZipFile needs a seekable file, so the input stream is spooled into a
    # temporary file first.
    with tempfile.TemporaryFile(mode='w+b') as tfp:
        tfp.write(input.read())
        with zipfile.ZipFile(tfp, "r") as zfp:
            for name in zfp.namelist():
                data = zfp.read(name)
                if _is_text(name, data):
                    debug("Appending text file '{}'".format(name))
                    _write_record(output, 'A', name, data, len(data))
                else:
                    debug("Appending binary file '{}'".format(name))
                    _write_record(output, 'B', name,
                                  base64.b64encode(data), len(data))


def decode(input, output):
    '''Decode from special VCS friendly format from input to output

    input  -- binary file-like object containing records written by encode
    output -- binary file-like object receiving the rebuilt ZIP archive

    Raises ValueError if a header line is malformed. Writes an error message
    and exits with status 1 on an unknown record type or truncated data.
    '''
    debug("DECODE was called")
    with tempfile.TemporaryFile(mode='w+b') as tfp:
        with zipfile.ZipFile(tfp, "w", zipfile.ZIP_DEFLATED) as zfp:
            while True:
                meta = input.readline().decode(ENCODING)
                if not meta:
                    break
                # Strip only the line terminator so that trailing blanks in
                # a file name survive, and split at most three times so that
                # a '|' inside the file name is kept.
                fields = meta.rstrip('\r\n').split('|', 3)
                (data_len, raw_len, mode, name) = fields
                data_len = int(data_len)
                raw_len = int(raw_len)  # parsed for validation only

                if mode not in ('A', 'B'):
                    # Should never reach here
                    error('Illegal mode "{}"'.format(mode))
                    sys.exit(1)

                payload = input.read(data_len)
                if len(payload) != data_len:
                    error("Truncated data for '{}'".format(name))
                    sys.exit(1)
                input.read(1)  # Skip last '\n'

                if mode == 'A':
                    debug("Appending text file '{}'".format(name))
                    zfp.writestr(name, payload)
                else:
                    debug("Appending binary file '{}'".format(name))
                    zfp.writestr(name, base64.b64decode(payload))

        # The archive is complete once the ZipFile is closed; copy it out.
        tfp.seek(0)
        output.write(tfp.read())


def main():
    '''Main program'''
    init()

    if len(sys.argv) < 2 or sys.argv[1] == '-' or sys.argv[1] == '--help':
        sys.stdout.write("{}\nTo encode: 'python zippey.py e'\nTo decode: 'python zippey.py d'\nAll files read from stdin and printed to stdout\n".format(NAME))
        return

    action = {'e': encode, 'd': decode}.get(sys.argv[1])
    if action is None:
        error("Illegal argument '{}'. Try --help for more information".format(sys.argv[1]))
        sys.exit(1)

    # closefd=False: these wrappers share descriptors with sys.stdin and
    # sys.stdout, so they must not close them when they are discarded.
    stream_in = io.open(sys.stdin.fileno(), 'rb', closefd=False)
    stream_out = io.open(sys.stdout.fileno(), 'wb', closefd=False)
    action(stream_in, stream_out)
    stream_out.flush()


if __name__ == '__main__':
    main()