Python之基于十六进制判断文件类型

Posted 小粉优化大师

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python之基于十六进制判断文件类型相关的知识,希望对你有一定的参考价值。

核心代码:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : suk
import struct
from io import BytesIO


# 支持文件类型
# 用16进制字符串的目的是可以知道文件头是多少字节
# 各种文件头的长度不一样,少则2字符,长则8字符
def typeList(types):
    type_dict = {\'jpg\': [\'FFD8FFE000104A464946\'],
                 \'png\': [\'89504E470D0A1A0A0000\'],
                 \'gif\': [\'47494638396126026F01\'],
                 \'tif\': [\'49492A00227105008037\'],
                 \'bmp\': [\'424D8E1B030000000000\'],
                 \'dwg\': [\'41433130313500000000\'],
                 \'html\': [\'3C21444F435459504520\'],
                 \'htm\': [\'3C21646F637479706520\'],
                 \'css\': [\'48544D4C207B0D0A0942\'],
                 \'js\': [\'696B2E71623D696B2E71\'],
                 \'rtf\': [\'7B5C727466315C616E73\'],
                 \'psd\': [\'38425053000100000000\'],
                 \'eml\': [\'46726F6D3A203D3F6762\'],
                 \'wps\': [\'D0CF11E0A1B11AE10000\'],
                 \'mdb\': [\'5374616E64617264204A\'],
                 \'ps\': \'[252150532D41646F6265]\',
                 \'pdf\': [\'255044462D312E\'],
                 \'rmvb\': [\'2E524D46000000120001\'],
                 \'flv\': [\'464C5601050000000900\'],
                 \'mp4\': [\'00000020667479706D70\'],
                 \'mp3\': [\'49443303000000002176\'],
                 \'mpg\': [\'000001BA210001000180\'],
                 \'wmv\': [\'3026B2758E66CF11A6D9\'],
                 \'wav\': [\'52494646E27807005741\'],
                 \'avi\': [\'52494646D07D60074156\'],
                 \'mid\': [\'4D546864000000060001\'],
                 \'zip\': [\'504B0304140000000800\', \'504B0304140000080800\', \'504B03040A0000080000\'],
                 \'rar\': [\'526172211A0700CF9073\'],
                 \'ini\': [\'235468697320636F6E66\'],
                 \'jar\': [\'504B03040A0000000000\'],
                 \'exe\': [\'4D5A9000030000000400\'],
                 \'jsp\': [\'3C25402070616765206C\'],
                 \'mf\': [\'4D616E69666573742D56\'],
                 \'xml\': [\'3C3F786D6C2076657273\'],
                 \'sql\': [\'494E5345525420494E54\'],
                 \'java\': [\'7061636B616765207765\'],
                 \'bat\': [\'406563686F206F66660D\'],
                 \'gz\': [\'1F8B0800000000000000\'],
                 \'properties\': [\'6C6F67346A2E726F6F74\'],
                 \'class\': [\'CAFEBABE0000002E0041\'],
                 \'chm\': [\'49545346030000006000\'],
                 \'mxp\': [\'04000000010000001300\'],
                 \'docx\': [\'504B0304140006000800\', \'504B03040A0000000000\'],
                 \'torrent\': [\'6431303A637265617465\'],
                 \'mov\': [\'6D6F6F76\'],
                 \'wpd\': [\'FF575043\'],
                 \'dbx\': [\'CFAD12FEC5FD746F\'],
                 \'pst\': [\'2142444E\'],
                 \'qdf\': [\'AC9EBD8F\'],
                 \'pwl\': [\'E3828596\'],
                 \'ram\': [\'2E7261FD\']
                 }
    ret = {}
    for k_hex, v_prefix in type_dict.items():
        if k_hex in types:
            ret[k_hex] = v_prefix
    return ret


# 字节码转16进制字符串
def bytes2hex(bytes):
    num = len(bytes)
    hexstr = u""
    for i in range(num):
        t = u"%x" % bytes[i]
        if len(t) % 2:
            hexstr += u"0"
        hexstr += t
    return hexstr.upper()


# 获取文件类型
def file_type(filename):
    binfile = open(filename, \'rb\')  # 必需二制字读取
    tl = typeList(types=["jpg", "zip", "docx"])
    ftype = None
    for type_name, hcode_list in tl.items():
        flag = False
        for hcode in hcode_list:
            numOfBytes = int(len(hcode) / 2)  # 需要读多少字节
            binfile.seek(0)  # 每次读取都要回到文件头,不然会一直往后读取
            hbytes = struct.unpack_from("B" * numOfBytes, binfile.read(numOfBytes))  # 一个 "B"表示一个字节
            f_hcode = bytes2hex(hbytes)  # 如果判断不出来,打印出这个值,往字典增加即可
            # print("上传数据流hex", s_hcode, \'=\', "代码字典hex", hcode)  # 如果判断不出来,打印出这个值,往字典增加即可
            if f_hcode == hcode:
                flag = True
                break
        if flag:
            ftype = type_name
            break
    binfile.close()
    return ftype


# 获取字节流类型
def stream_type(stream, types):
    """
    :param stream:流数据
    :param types:需要判断文件类型,格式:["jpg","jpn"]
    :return:
    """
    tl = typeList(types=types)
    ftype = None
    for type_name, hcode_list in tl.items():
        flag = False
        for hcode in hcode_list:
            numOfBytes = int(len(hcode) / 2)  # 需要读多少字节
            hbytes = struct.unpack_from("B" * numOfBytes, stream[0:numOfBytes])  # 一个 "B"表示一个字节
            s_hcode = bytes2hex(hbytes)
            # print("上传数据流hex", s_hcode, \'=\', "代码字典hex", hcode)  # 如果判断不出来,打印出这个值,往字典增加即可
            if s_hcode == hcode:
                flag = True
                break
        if flag:
            ftype = type_name
            break
    return ftype


def stream_split(stream, count=3):
    """
    主要处理流是分段获取的数据
    :param stream: 块流
    :param count: 取多少段合成来判断类型,默认三段
    :return:
    """
    block_stream = BytesIO()
    temp = 1
    for block in stream:
        block_stream.write(block)
        if temp == count:
            break
        temp += 1
    return block_stream.getvalue()
is_file_type.py
type_dict字典,根据自己上传的文件,来填写,数据来自互联网。

 

基于Flask的上传示例

@index.route(\'/upload\', methods=[\'GET\', \'POST\'])
def upload():

    if request.method == \'GET\':
        return render_template(\'upload.html\')

    upload_obj = request.files.get(\'code_file\')

    if not upload_obj:
        return \'没有选择文件上传\'

    ret = stream_type(stream_split(upload_obj.stream), ["jpg", "png", "pdf"])

    if not ret:
        return \'上传失败,文件类型不匹配,类型必须 "jpg" or "png" or "pdf"\'

    file_name = upload_obj.filename
    upload_obj.save(os.path.join(\'files\', file_name))

    return \'上传文件成功\'

upload.html

{% extends \'layout.html\' %}
{% block content %}
<h1>上传代码</h1>
<form action="" method="post" enctype="multipart/form-data">
    <input type="file" name="code_file">
    <input type="submit" value="上传"></input>
</form>
{% endblock %}

 

开始上传文件:

上传不在列表中的文件类型

 

 


上传在列表中的文件类型

 

 

以上是关于Python之基于十六进制判断文件类型的主要内容,如果未能解决你的问题,请参考以下文章

python之模块和包

如何用c语言判断一个未知文件的文件类型

初识python(条件判断循环控制循环次数限制常用数据类型字符串格式化列表常用操作二进制运算嵌套循环)

python 数据类型 之 数字类型

学习 Python 之 数据类型

Python学习第2篇:Python之数据类型