메뉴 건너뛰기

Hello :0

1편 내용에 이어서...

 

2.png

위와 같이 데이터를 쌓았다. 이제 본격적으로 머신러닝을 하기 위한 데이터셋을 만들 차례이다.

일단 테스트로, 머신러닝에 사용할 피처는 샘플 파일의 Opcode와 API, PE 섹션명 정도로 정하였다.

이런 PE 파일의 정보를 얻기 위해 pefile 모듈과 distorm3이라는 Python용 win32/64 디스어셈블러를 사용했다.

정보를 추출하기 위한 코드는 아래와 같다.

import pefile
import sys
import distorm3
from hashlib import sha1
from hashlib import sha256
from hashlib import sha512
from hashlib import md5
import pymongo
import re



# (name, value) pairs for the PE section-header "Characteristics" field.
# Several names intentionally share one value (0x4000, 0x8000, 0x20000).
section_characteristics = [
    # Reserved type bits.
    ('IMAGE_SCN_TYPE_REG',                  0x00000000), # reserved
    ('IMAGE_SCN_TYPE_DSECT',                0x00000001), # reserved
    ('IMAGE_SCN_TYPE_NOLOAD',               0x00000002), # reserved
    ('IMAGE_SCN_TYPE_GROUP',                0x00000004), # reserved
    ('IMAGE_SCN_TYPE_NO_PAD',               0x00000008), # reserved
    ('IMAGE_SCN_TYPE_COPY',                 0x00000010), # reserved

    # Section content.
    ('IMAGE_SCN_CNT_CODE',                  0x00000020),
    ('IMAGE_SCN_CNT_INITIALIZED_DATA',      0x00000040),
    ('IMAGE_SCN_CNT_UNINITIALIZED_DATA',    0x00000080),

    # Linker information.
    ('IMAGE_SCN_LNK_OTHER',                 0x00000100),
    ('IMAGE_SCN_LNK_INFO',                  0x00000200),
    ('IMAGE_SCN_LNK_OVER',                  0x00000400), # reserved
    ('IMAGE_SCN_LNK_REMOVE',                0x00000800),
    ('IMAGE_SCN_LNK_COMDAT',                0x00001000),

    # Miscellaneous memory bits (contains aliased values).
    ('IMAGE_SCN_MEM_PROTECTED',             0x00004000), # obsolete
    ('IMAGE_SCN_NO_DEFER_SPEC_EXC',         0x00004000),
    ('IMAGE_SCN_GPREL',                     0x00008000),
    ('IMAGE_SCN_MEM_FARDATA',               0x00008000),
    ('IMAGE_SCN_MEM_SYSHEAP',               0x00010000), # obsolete
    ('IMAGE_SCN_MEM_PURGEABLE',             0x00020000),
    ('IMAGE_SCN_MEM_16BIT',                 0x00020000),
    ('IMAGE_SCN_MEM_LOCKED',                0x00040000),
    ('IMAGE_SCN_MEM_PRELOAD',               0x00080000),

    # Alignment (a 4-bit field, not independent bits).
    ('IMAGE_SCN_ALIGN_1BYTES',              0x00100000),
    ('IMAGE_SCN_ALIGN_2BYTES',              0x00200000),
    ('IMAGE_SCN_ALIGN_4BYTES',              0x00300000),
    ('IMAGE_SCN_ALIGN_8BYTES',              0x00400000),
    ('IMAGE_SCN_ALIGN_16BYTES',             0x00500000), # default alignment
    ('IMAGE_SCN_ALIGN_32BYTES',             0x00600000),
    ('IMAGE_SCN_ALIGN_64BYTES',             0x00700000),
    ('IMAGE_SCN_ALIGN_128BYTES',            0x00800000),
    ('IMAGE_SCN_ALIGN_256BYTES',            0x00900000),
    ('IMAGE_SCN_ALIGN_512BYTES',            0x00A00000),
    ('IMAGE_SCN_ALIGN_1024BYTES',           0x00B00000),
    ('IMAGE_SCN_ALIGN_2048BYTES',           0x00C00000),
    ('IMAGE_SCN_ALIGN_4096BYTES',           0x00D00000),
    ('IMAGE_SCN_ALIGN_8192BYTES',           0x00E00000),
    ('IMAGE_SCN_ALIGN_MASK',                0x00F00000),

    # Relocation overflow and memory attributes.
    ('IMAGE_SCN_LNK_NRELOC_OVFL',           0x01000000),
    ('IMAGE_SCN_MEM_DISCARDABLE',           0x02000000),
    ('IMAGE_SCN_MEM_NOT_CACHED',            0x04000000),
    ('IMAGE_SCN_MEM_NOT_PAGED',             0x08000000),
    ('IMAGE_SCN_MEM_SHARED',                0x10000000),
    ('IMAGE_SCN_MEM_EXECUTE',               0x20000000),
    ('IMAGE_SCN_MEM_READ',                  0x40000000),
    ('IMAGE_SCN_MEM_WRITE',                 0x80000000),
]

# Two-way lookup table: value -> name entries are inserted first, then
# name -> value entries. For values shared by several names, the name that
# appears last in the table wins the value -> name slot.
SECTION_CHARACTERISTICS = dict(
    (value, name) for name, value in section_characteristics
)
SECTION_CHARACTERISTICS.update(section_characteristics)


def retrieve_flags(flag_dict, flag_filter):
    """Collect the named flags of a lookup dictionary.

    Only entries whose key is a string (``str`` or ``bytes``) beginning with
    "flag_filter" are kept; other keys are skipped.

    Returns a list of (key, value) tuples in the dictionary's iteration
    order.
    """
    selected = []
    for key, value in list(flag_dict.items()):
        if not isinstance(key, (str, bytes)):
            continue
        if not key.startswith(flag_filter):
            continue
        selected.append((key, value))
    return selected

# (name, value) pairs for every string key of SECTION_CHARACTERISTICS that
# starts with 'IMAGE_SCN_'; the integer (value -> name) keys are left out.
section_flags = retrieve_flags(SECTION_CHARACTERISTICS, 'IMAGE_SCN_')


def get_info(sample_path):
    """Extract section, opcode and API features from a PE file.

    Args:
        sample_path: Path of the PE file; passed straight to ``pefile.PE``.

    Returns:
        A ``(section_name, op_list_count, api_list)`` tuple:

        * ``section_name`` maps each section name (stripped down to word
          characters) to the result of ``section.get_entropy()``. Sections
          whose cleaned names collide overwrite each other.
        * ``op_list_count`` maps the first space-separated token of every
          instruction decoded from an executable section to the number of
          times it occurred.
        * ``api_list`` holds the imported symbol names followed by the
          exported ones, exactly as pefile reports them.

    Raises:
        Whatever ``pefile.PE`` raises when the file cannot be opened or
        parsed.
    """
    pe = pefile.PE(sample_path)
    try:
        op_list_count = {}
        section_name = {}
        api_list = []

        # PE32+ images carry 64-bit code; decoding them in 32-bit mode would
        # produce a meaningless opcode histogram.
        if pe.OPTIONAL_HEADER.Magic == pefile.OPTIONAL_HEADER_MAGIC_PE_PLUS:
            decode_mode = distorm3.Decode64Bits
        else:
            decode_mode = distorm3.Decode32Bits

        for section in pe.sections:
            # Only sections flagged as executable are disassembled.
            if section.IMAGE_SCN_MEM_EXECUTE:
                decoded = distorm3.DecodeGenerator(
                    0, section.get_data(), decode_mode)
                for (_offset, _size, instruction, _hexdump) in decoded:
                    op_code = instruction.split(" ")[0]
                    op_list_count[op_code] = op_list_count.get(op_code, 0) + 1

            # Under Python 3 pefile exposes the raw name as bytes, which a
            # str regex pattern rejects with TypeError. Dropping non-ASCII
            # bytes here is harmless: the regex strips non-word characters
            # (including the NUL padding) right after.
            raw_name = section.Name
            if not isinstance(raw_name, str):
                raw_name = raw_name.decode('ascii', 'ignore')
            s_name = re.sub(r'[^\w]', "", raw_name)
            section_name[s_name] = section.get_entropy()

        pe.parse_data_directories()

        # The directory attributes are looked up with a default because a
        # file without imports (or exports) would otherwise raise
        # AttributeError; any other error now propagates to the caller
        # instead of being silently swallowed.
        for entry in getattr(pe, 'DIRECTORY_ENTRY_IMPORT', []):
            for imp in entry.imports:
                api_list.append(imp.name)

        export_dir = getattr(pe, 'DIRECTORY_ENTRY_EXPORT', None)
        if export_dir is not None:
            for exp in export_dir.symbols:
                api_list.append(exp.name)

        return section_name, op_list_count, api_list
    finally:
        # Release the file handle / mapping held by pefile; this function is
        # meant to be called once per sample over a large corpus.
        pe.close()
 

Opcode는 PEview를 보고 확인한 것처럼, 실행 가능한 섹션 영역만 디스어셈블하도록 한다.

 

3.png

섹션명, Opcode, API 등의 정보를 알아내고

새로운 샘플이 확인될 때마다 정보를 추출한 후 다시 mongodb에 저장한다.

 

3.ong.png

안랩 V3에서 진단하는 샘플만 저장한다. 며칠을 돌려보니 PUP가 진짜 많더라....