[Python] Python Malware Crawler
2016.09.11 16:58
#!/usr/bin/python
# Copyright (C) 2012 Ricardo Dias
#
# Malware Crawler Module v0.4
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Requirements:
# - BeautifulSoup 4 (bs4) with the lxml parser
# - python-magic

from bs4 import BeautifulSoup
import sys
import hashlib
import re
import urllib2
import magic
import os
import socket
import datetime

# By default thug analysis is disabled
isthug = False

# current date, used to build the Sacour.cn feed URL
now = datetime.datetime.now()

# maximum wait time of http gets
timeout = 15
socket.setdefaulttimeout(timeout)


# load thug function, also checks if thug is installed
def loadthug():
    global isthug, thug  # decisor() needs both after loading
    try:
        sys.path.append('../thug/src')
        import thug
        isthug = True
        print "- Thug module loaded for html analysis"
    except ImportError:
        print "- No Thug module found, html code inspection won't be available"


# determine file type for correct archival
def gettype(file_):
    # the old libmagic binding API, replaced by python-magic:
    # ms = magic.open(magic.MAGIC_NONE)
    # ms.load()
    # return ms.buffer(file_)
    return magic.from_buffer(file_)


# beautifulsoup parser
def parse(url):
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)')
    try:
        http = BeautifulSoup(urllib2.urlopen(request), "lxml")
    except:
        print "- Error parsing %s" % (url)
        return
    return http


# download a candidate URL and archive it by file type
def decisor(site, url):
    if not re.match('http', url):
        url = 'http://' + url

    try:
        url_dl = urllib2.urlopen(url).read()
    except Exception, e:
        # print "-- Error: %s" % e
        return

    filetype = gettype(url_dl).split(' ')[0]
    md5 = hashlib.md5(url_dl).hexdigest()

    if filetype == 'HTML':
        if isthug:
            print "-- Thug candidate: HTML code in %s" % url
            try:
                thug.Thug([url])()
            except Exception, e:
                print "- Thug error: %s" % e
            return
    else:
        dest = './malware/' + filetype
        fpath = dest + '/' + str(md5)

        if not os.path.exists(dest):
            os.makedirs(dest)

        if not os.path.exists(fpath):
            f = open(fpath, 'wb')
            f.write(url_dl)
            f.close()
            print "-- %s: saved file type %s with md5: %s" % (site, filetype, md5)


def malwaredl(soup):
    print "- Fetching from Malware Domain List"
    mdl = []
    for row in soup('description'):
        mdl.append(row)
    del mdl[0]
    mdl_sites = []
    for row in mdl:
        site = re.sub('&amp;', '&', str(row).split()[1]).replace(',', '')
        if site == '-':
            mdl_sites.append(re.sub('&amp;', '&', str(row).split()[4]).replace(',', ''))
        else:
            mdl_sites.append(site)
    print "-- Found %s urls" % len(mdl)
    for row in mdl_sites:
        decisor("malwaredl", row)


def vxvault(soup):
    print "- Fetching from VXVault"
    vxv = []
    for row in soup('pre'):
        vxv = row.string.split('\r\n')
    del vxv[:4]
    del vxv[-1]
    print "-- Found %s urls" % len(vxv)
    for row in vxv:
        decisor("vxvault", row)


def malc0de(soup):
    print "- Fetching from Malc0de"
    mlc = []
    for row in soup('description'):
        mlc.append(row)
    del mlc[0]
    mlc_sites = []
    for row in mlc:
        site = re.sub('&amp;', '&', str(row).split()[1]).replace(',', '')
        mlc_sites.append(site)
    print "-- Found %s urls" % len(mlc_sites)
    for row in mlc_sites:
        decisor("malc0de", row)


def malwarebl(soup):
    print "- Fetching from Malware Black List"
    mbl = []
    for row in soup('description'):
        site = str(row).split()[1].replace(',', '')
        mbl.append(site)
    print "-- Found %s urls" % len(mbl)
    for row in mbl:
        decisor("malwarebl", row)


def minotaur(soup):
    print "- Fetching from NovCon Minotaur"
    min = []
    # the URLs now live in the fourth column of the table inside div#mtabs-2,
    # so the old soup('td') scan was replaced with a targeted lookup:
    minota_body = soup.find("div", {"id": "mtabs-2"})
    parse_tr = minota_body.findAll("tr")
    for row in parse_tr:
        list_td = row.findAll("td")
        if len(list_td) == 0:
            continue
        try:
            min.append(list_td[3].text)
        except:
            pass
    print "-- Found %s urls" % len(min)
    for row in min:
        decisor("minotaur", row)


def sacour(soup):
    print "- Fetching from Sacour.cn"
    for url in soup('a'):
        min = []
        if re.match('list/', url['href']):
            suburl = parse('http://www.sacour.cn/' + url['href'])
            for text in suburl('body'):
                for urls in text.contents:
                    if re.match('http://', str(urls)):
                        min.append(str(urls))
        if len(min) > 0:
            print "-- Found %s urls in %s" % (len(min), url['href'])
        for row in min:
            decisor("sacour", row)


if __name__ == "__main__":
    print "Malware Parser v0.4"

    try:
        if sys.argv[1] == '-t':
            loadthug()
    except:
        print "- Thug analysis not enabled (use -t to enable thug)"

    # source list
    minotaur(parse('http://minotauranalysis.com'))
    malwaredl(parse('http://www.malwaredomainlist.com/hostslist/mdl.xml'))
    vxvault(parse('http://vxvault.siri-urz.net/URL_List.php'))
    malc0de(parse('http://malc0de.com/rss'))
    malwarebl(parse('http://www.malwareblacklist.com/mbl.xml'))
    # sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)))  # site dead
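The __main__ block above just wires the sources together: each feed page is fetched with parse() and handed to a source-specific extractor, and every URL those extractors find is funneled into decisor(), which downloads the payload, fingerprints it with libmagic, and stores it under ./malware/<filetype>/<md5>. As a minimal sketch, decisor() can also be driven by hand; this assumes the script was saved as mwcrawler.py (the filename and sample URL are my assumptions):

# Hypothetical one-off use of the download/archive core.
# Assumes the script above is saved as mwcrawler.py in the
# current directory; the sample URL is a placeholder.
from mwcrawler import decisor

decisor("manual", "http://example.com/samples/payload.exe")
# -> non-HTML payloads end up in ./malware/<filetype>/<md5>;
#    HTML is only analyzed when Thug was loaded with -t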
This is https://github.com/0day1day/mwcrawler, modified to fix the magic library error and to make Minotaur collection work again.
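The magic error the note refers to comes from two incompatible Python bindings sharing the module name magic: the binding bundled with the file utility uses an open/load/buffer sequence, while the python-magic package from PyPI exposes one-shot helpers such as from_buffer(). A quick sketch of the difference, assuming the PyPI python-magic package is the one installed:

import magic

# read any local binary as sample data
data = open('/bin/ls', 'rb').read()

# python-magic (PyPI): single call returning the description string,
# which gettype() above relies on
print magic.from_buffer(data)   # e.g. "ELF 64-bit LSB executable, ..."

# the file-bundled binding (what the original mwcrawler expected):
# ms = magic.open(magic.MAGIC_NONE)
# ms.load()
# print ms.buffer(data)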