메뉴 건너뛰기

Hello :0

import pymongo
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

# Connect to the local MongoDB instance and select the collection that holds
# one document per analysed sample.
connection = pymongo.MongoClient("mongodb://127.0.0.1")
db = connection.malware_zoo
learning_data_all = db.learning_data_all

zoo_data = learning_data_all.find()


# Accumulators filled by the feature-building pass later in the script.
process_info_list = []
r_label = []
detect_and_num = {}  # kept for compatibility; not used in the visible script

# Substrings that frequently show up in ransomware detection names.
# "cerber" replaces the original misspelling "ceber", which could never match
# a detection name that spells the Cerber family correctly.
RANSOM_KEYWORDS = ("locky", "zepto", "ransom", "tesla", "cerber")

# Invincea's detection results looked ambiguous, so they are excluded for now.
EXCLUDED_VENDORS = ("Invincea",)

# Collect the md5 of every sample that at least one remaining vendor labels
# with a ransomware-looking name. A set avoids the append-then-dedupe dance.
ransom_md5s = set()
for line in zoo_data:
    for vender, report in line["detect"].items():
        if vender in EXCLUDED_VENDORS:
            continue
        verdict = str(report["result"]).lower()
        if any(keyword in verdict for keyword in RANSOM_KEYWORDS):
            ransom_md5s.add(line["md5"])
            break  # one matching vendor is enough to flag the sample

# Later code only needs membership tests and len(); keep the list type the
# original script exposed.
ransom_list = list(ransom_md5s)
# Single-argument print: same output under Python 2, also valid Python 3.
print("suspicious ransom sample... %d" % len(ransom_list))

# Second pass over the collection: build one feature dict and one one-hot
# label per sample, then vectorise and normalise the features.
zoo_data = learning_data_all.find()

# Build the lookup set once; testing membership against the list itself would
# rescan it for every sample.
ransom_lookup = set(ransom_list)

for line in zoo_data:
    # The sample's "opcode" dict is extended in place and used as its
    # feature dict.
    features = line["opcode"]
    # Every distinct entry of "ie_api" becomes a presence feature with value 1.
    for api in set(line["ie_api"]):
        features[api] = 1
    features.update(line["section_info"])
    process_info_list.append(features)

    # One-hot label: [0, 1] for suspected ransomware, [1, 0] for everything else.
    if line["md5"] in ransom_lookup:
        r_label.append([0, 1])
    else:
        r_label.append([1, 0])


r_label = np.array(r_label)

# sparse=False makes DictVectorizer return a dense 2-D array with one column
# per distinct feature key.
v = DictVectorizer(sparse=False)
X = v.fit_transform(process_info_list)

X_normalized = preprocessing.normalize(X)
# Report the number of feature columns (same output as the original print).
print("X_normalized %d" % X_normalized.shape[1])

 - 악성코드 샘플들의 데이터를 가져와서, 랜섬웨어 진단명(진단명에 ransom, locky, zepto 등이 포함된 경우)을 가진 것들만 따로 리스트로 만든다.

 - DB에 저장된 내용을 토대로 벡터화한 뒤 정규화한다.

 

# Dimensionality reduction
from sklearn import decomposition

# NOTE(review): the original script fitted PCA and transformed the data, then
# immediately overwrote the result with the un-reduced matrix (twice), so the
# PCA work was thrown away. The flag below keeps that effective behaviour
# (train on X_normalized) while skipping the wasted fit. Set it to True to
# actually train on the 500-component projection.
USE_PCA = False

pca = decomposition.PCA(n_components=500)

# Data actually used for training
if USE_PCA:
    pca.fit(X_normalized)
    x_vector = pca.transform(X_normalized)
else:
    x_vector = X_normalized
print(x_vector[0])

print(len(r_label))
print(r_label)
 

 - 차원이 높으면 램이 부족해지기 때문에 차원을 줄인다. 500~4000 정도로 해도 결과는 크게 변하지 않는 것 같다.

 

import tensorflow as tf

x_data = x_vector
y_data = r_label

# Randomly shuffle data (fixed NumPy seed so the split is reproducible)
np.random.seed(100)
shuffle_indices = np.random.permutation(np.arange(len(y_data)))
x_shuffled = x_data[shuffle_indices]
y_shuffled = y_data[shuffle_indices]

# Split train/test set: first 98% for training, remaining 2% held out.
# TODO: This is very crude, should use cross-validation
cut = int(len(x_shuffled) * 0.98)

x_train, x_dev = x_shuffled[:cut], x_shuffled[cut:]
y_train, y_dev = y_shuffled[:cut], y_shuffled[cut:]

# parameters
learning_rate = 0.0001
training_epochs = 2000
batch_size = 100
display_step = 1

# tf graph input: one row per sample, one-hot labels
X = tf.placeholder("float", [None, len(x_train[0])])
Y = tf.placeholder("float", [None, len(y_train[0])])

# store layers weight & bias
# Eight hidden layers (seven of width 500, one of width 100) followed by a
# linear output layer with one unit per class.
W1 = tf.Variable(tf.random_normal([len(x_train[0]), 500]))
W2 = tf.Variable(tf.random_normal([500, 500]))
W3 = tf.Variable(tf.random_normal([500, 500]))
W4 = tf.Variable(tf.random_normal([500, 500]))
W5 = tf.Variable(tf.random_normal([500, 500]))
W6 = tf.Variable(tf.random_normal([500, 500]))
W7 = tf.Variable(tf.random_normal([500, 500]))
W8 = tf.Variable(tf.random_normal([500, 100]))
W9 = tf.Variable(tf.random_normal([100, len(y_data[0])]))

B1 = tf.Variable(tf.random_normal([500]))
B2 = tf.Variable(tf.random_normal([500]))
B3 = tf.Variable(tf.random_normal([500]))
B4 = tf.Variable(tf.random_normal([500]))
B5 = tf.Variable(tf.random_normal([500]))
B6 = tf.Variable(tf.random_normal([500]))
B7 = tf.Variable(tf.random_normal([500]))
B8 = tf.Variable(tf.random_normal([100]))
B9 = tf.Variable(tf.random_normal([len(y_data[0])]))

# Construct model: ReLU hidden layers, raw logits at the output

L1 = tf.nn.relu(tf.add(tf.matmul(X, W1), B1))
L2 = tf.nn.relu(tf.add(tf.matmul(L1, W2), B2))
L3 = tf.nn.relu(tf.add(tf.matmul(L2, W3), B3))
L4 = tf.nn.relu(tf.add(tf.matmul(L3, W4), B4))
L5 = tf.nn.relu(tf.add(tf.matmul(L4, W5), B5))
L6 = tf.nn.relu(tf.add(tf.matmul(L5, W6), B6))
L7 = tf.nn.relu(tf.add(tf.matmul(L6, W7), B7))
L8 = tf.nn.relu(tf.add(tf.matmul(L7, W8), B8))
hypothesis = tf.add(tf.matmul(L8, W9), B9)

# Define loss and optimizer
# Keyword arguments keep the call unambiguous: the positional order of this
# function is not stable across TensorFlow releases.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Accuracy ops are built once here. Creating them inside the epoch loop (as
# the script originally did) adds new nodes to the graph on every epoch.
correct_prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

# initializing the variables
init = tf.initialize_all_variables()

# Number of batches = total training samples / batch size. Only full batches
# are used, so trailing samples beyond the last full batch are not trained on.
# max(1, ...) keeps the loop (and batch_xs/batch_ys below) defined when there
# are fewer than batch_size training samples.
total_batch = max(1, len(x_train) // batch_size)

# launch the graph
print("Start Trainging")
with tf.Session() as sess:
    sess.run(init)
    # training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        # Loop over all batches: rows 0-99, 100-199, 200-299, ...
        for i in range(total_batch):
            batch_start = i * batch_size
            batch_end = batch_start + batch_size
            batch_xs = x_train[batch_start:batch_end]
            batch_ys = y_train[batch_start:batch_end]
            # Fit training using batch data
            sess.run(optimizer, feed_dict={X: batch_xs, Y: batch_ys})
            # Compute average loss (evaluated after the update step)
            avg_cost += sess.run(cost, feed_dict={X: batch_xs, Y: batch_ys}) / total_batch

        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch: %04d cost= %.9f" % (epoch + 1, avg_cost))
            # NOTE: despite its label, this figure is measured on the last
            # training mini-batch of the epoch, not on held-out data.
            print("Test Accuracy: %s" % accuracy.eval({X: batch_xs, Y: batch_ys}))
            # Accuracy on the held-out 2% split.
            print("Predict Accuracy: %s" % accuracy.eval({X: x_dev, Y: y_dev}))
            print("")

    print("Optimization Finished!")

    # Test model: final accuracy on the held-out split
    print("Accuracy: %s" % accuracy.eval({X: x_dev, Y: y_dev}))

 - 머신러닝 모델을 만들고 학습을 돌린다.

 

1.png

 - 전체 악성코드 중에서 랜섬웨어를 분류했을 때 정확도를 97%까지 끌어올렸다.

 

참고 사이트

https://hunkim.github.io/ml/lab10.pdf

https://github.com/FuZer/Study_TensorFlow