List:       nmap-dev
Subject:    [NSE] http-referer-checker.nse
From:       George Chatzisofroniou <sophron () latthi ! com>
Date:       2013-06-23 15:31:06
Message-ID: CACeRBz=Hqi-5P-BWouefUDBMTApBvyYhS8=38_ARGRTbyNxv3g () mail ! gmail ! com
[Download RAW message or body]

The attached script reports cross-domain includes of scripts. Websites
that include external JavaScript files are delegating part of their
security to third-party entities, since the included code has full
client-side power and can do whatever it wants (like steal
document.cookie or send malicious AJAX requests). So, it's important
for developers to never include a JavaScript file from a domain they
don't trust.
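
To make the check concrete, the core idea is simply: for every crawled
page, look at the src of each <script> tag and compare its host against
the scanned target. Here is a rough, self-contained Lua sketch of that
check (this is only an illustration under my assumptions, not the logic
of the attached script; external_script_srcs and target_host are
made-up names):

local function external_script_srcs(body, target_host)
  local found = {}
  -- naive scan for <script ... src="..."> values in the page body
  for src in body:gmatch("<[Ss][Cc][Rr][Ii][Pp][Tt][^>]-src%s*=%s*[\"']([^\"']+)[\"']") do
    -- pull out the host part of absolute and protocol-relative URLs
    local host = src:match("^https?://([^/:]+)") or src:match("^//([^/:]+)")
    if host and host:lower() ~= target_host:lower() then
      table.insert(found, src)
    end
  end
  return found
end

A pattern-based scan like this obviously misses scripts injected at
runtime; the attached script instead builds on httpspider's own link
extraction and reports what the crawler discovers.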

To make this work properly, I had to make some improvements to the
httpspider library (I'll start a separate thread for those changes).
So, you also need to apply the attached patch.
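
For reference, here is roughly how a script would pass the new
httpspider.blacklistforscraping option from the patch when it creates
its crawler. This is a minimal sketch assuming the usual httpspider
setup; whether the final script really sets withinhost = false is my
assumption, while the option table itself follows the format documented
in the patch:

local httpspider = require "httpspider"

-- crawl external URLs too, but never scrape (extract further links
-- from) resources outside the target host, nor any JS file at all
local crawler = httpspider.Crawler:new(host, port, '/', {
    scriptname = SCRIPT_NAME,
    withinhost = false,
    blacklistforscraping = {
        { extension = "*",  location = "outsidehost" },
        { extension = "js", location = "withinoroutsidehost" },
    },
})

The patch also wires the option into the get_script_args() hooks (see
the hunks below), but setting a nested table from script code as above
is the most readable way to use it.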

The script is easy to use. It has no arguments of its own, only the
spidering arguments from the httpspider library.

To test it, try:

./nmap -p80 -n -Pn --script http-referer-checker.nse 83.212.115.76

(Feel free to test it on this server. It's a VM of mine that I've set
up for Nmap research.)

The output should be something like:

PORT   STATE SERVICE REASON
80/tcp open  http    syn-ack
| http-referer-checker:
| Spidering limited to: maxpagecount=30
|   http://ajax.googleapis.com/ajax/libs/dojo/1.9.1/dojo/dojo.js
|   http://ajax.googleapis.com/ajax/libs/prototype/1.7.1.0/prototype.js
|   http://code.jquery.com/jquery-latest.js
|_  http://s7.addthis.com/js/300/addthis_widget.js#pubid=xa-511d06db78eb3b45

Any feedback is welcome,

--
George Chatzisofroniou
sophron.latthi.com

["http-referer-checker.nse" (application/octet-stream)]
["adding_blacklist_for_scraping.diff" (application/octet-stream)]

Index: nselib/httpspider.lua
===================================================================
--- nselib/httpspider.lua	(revision 31056)
+++ nselib/httpspider.lua	(working copy)
@@ -57,6 +57,22 @@
 --       HEAD instead of GET for files that do not have extensions indicating
 --       that they are webpages (the list of webpage extensions is located in
 --       nselib/data/http-web-files-extensions.lst)
+-- @args httpspider.blacklistforscraping the crawler won't scrape the 
+--       resources defined in this table. The table should contain 
+--       dictionaries with 'extension' and 'location' keys defined. For 
+--       example, if I set this argument to { { extension = "*", location = 
+--       "outsidehost"}, { extension = "js", location = "withinoroutsidehost"} 
+--       }, the crawler won't scrape any resources outside the target or any
+--       JS files within or outside the target host. The 'extension' key 
+--       may hold valid strings as extensions or the "*" symbol for all 
+--       extensions (hence, all URLs). The 'location' may contain three values: 
+--       'withinhost' for URLs that point inside the host target, 'outsidehost' 
+--       for URLs that point outside the host target and 'withinoroutsidehost' 
+--       for URLs that point anywhere. Please note that by using this argument, 
+--       the crawler will spider the defined URLs (but it won't scrape them). If
+--       you need to define a blacklist for spidering, please check the 
+--       'withinhost', 'withindomain', 'noblacklist' and 'maxdepth' arguments. 
+--       (default: {})
 --
 
 local coroutine = require "coroutine"
@@ -85,42 +101,6 @@
 		o.timeout  = options.timeout or 10000
 		o.whitelist = o.whitelist or {}
 		o.blacklist = o.blacklist or {}
-    local removewww = function(url) return string.gsub(url, "^www%.", "") end
-		
-		if ( o.withinhost == true or o.withindomain == true ) then
-			-- set up the appropriate matching functions
-			if ( o.withinhost ) then
-				o.withinhost = function(u)
-					local parsed_u = url.parse(tostring(u))
-													
-					if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
-						if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
-							return false
-						end
-					elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
-						return false
-				  -- if urls don't match only on the "www" prefix, then they are probably the same
-					elseif ( parsed_u.host == nil or removewww(parsed_u.host:lower()) ~= removewww(o.base_url:getHost():lower()) ) then
-						return false
-					end
-					return true
-				end
-			else
-				o.withindomain = function(u)
-					local parsed_u = url.parse(tostring(u))				
-					if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
-						if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
-							return false
-						end
-					elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
-						return false
-					elseif ( parsed_u.host == nil or parsed_u.host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then
-						return false
-					end
-					return true
-				end
-			end
-		end
 		setmetatable(o, self)
 		self.__index = self
 		return o
@@ -153,6 +133,41 @@
 		return o
 	end,
 	
+   removewww = function(url) return string.gsub(url, "^www%.", "") end,
+
+   iswithinhost = function(self, u, o)
+                    
+        local parsed_u = url.parse(tostring(u))
+                                        
+        if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
+            if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
+                return false
+            end
+        elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
+            return false
+      -- if urls don't match only on the "www" prefix, then they are probably the same
+        elseif ( parsed_u.host == nil or self.removewww(parsed_u.host:lower()) ~= self.removewww(o.base_url:getHost():lower()) ) then
+            return false
+        end
+        return true
+    end,
+
+    iswithindomain = function(self, u, o)
+
+        local parsed_u = url.parse(tostring(u))				
+
+        if ( o.base_url:getPort() ~= 80 and o.base_url:getPort() ~= 443 ) then
+            if ( tonumber(parsed_u.port) ~= tonumber(o.base_url:getPort()) ) then
+                return false
+            end
+        elseif ( parsed_u.scheme ~= o.base_url:getProto() ) then
+            return false
+        elseif ( parsed_u.host == nil or parsed_u.host:sub(-#o.base_url:getDomain()):lower() ~= o.base_url:getDomain():lower() ) then
+            return false
+        end
+        return true
+    end,
+
 	-- is the link absolute or not?
 	isAbsolute = function(url)
 		-- at this point we don't care about the protocol
@@ -171,6 +186,11 @@
 	-- @return link string containing the absolute link
 	createAbsolute = function(base_url, rel_url, base_href)
 
+        -- is protocol-relative?
+        if rel_url:match("^//") then
+			return ("%s%s%s"):format(base_url:getProto(), ":", rel_url)
+        end
+
 		-- is relative with leading slash? ie /dir1/foo.html
 		local leading_slash = rel_url:match("^/")
 		rel_url = rel_url:match("^/?(.*)") or '/'
@@ -226,7 +246,7 @@
 			end
 		end
 	end,
-	
+
   	validate_link = function(self, url)
 		local valid = true
 
@@ -248,7 +268,7 @@
 
 		-- withindomain trumps any whitelisting
 		if ( self.options.withindomain ) then
-			if ( not(self.options.withindomain(url)) ) then
+			if ( not(self.iswithindomain(self, url, self.options)) ) then
 				stdnse.print_debug(2, "%s: Link is not within domain: %s", LIBRARY_NAME, tostring(url))
 				return false
 			end
@@ -256,7 +276,7 @@
 
 		-- withinhost trumps any whitelisting
 		if ( self.options.withinhost ) then
-			if ( not(self.options.withinhost(url)) ) then
+			if ( not(self.iswithinhost(self, url, self.options)) ) then
 				stdnse.print_debug(2, "%s: Link is not within host: %s", LIBRARY_NAME, tostring(url))
 				return false
 			end
@@ -503,6 +523,7 @@
 	--        <code>maxpagecount</code> - the maximum amount of pages to retrieve
 	--        <code>withinhost</code> - stay within the host of the base_url
 	--        <code>withindomain</code> - stay within the base_url domain
+	--        <code>blacklistforscraping</code> - don't scrape resources defined in this table
 	--        <code>scriptname</code> - should be set to SCRIPT_NAME to enable
 	--                                  script specific arguments.
 	--        <code>redirect_ok</code> - redirect_ok closure to pass to http.get function
@@ -553,7 +574,7 @@
 		if ( not(o.options.maxpagecount) ) then
 			o.options.maxpagecount = tonumber(stdnse.get_script_args("httpspider.maxpagecount"))
 		end
-		
+
 		if ( not(o.options.noblacklist) ) then
 			o:addDefaultBlacklist()
 		end
@@ -631,6 +652,7 @@
 	-- This way the script can alert the user of the details by calling
 	-- getError()
 	crawl_thread = function(self, response_queue)
+        
 		local condvar = nmap.condvar(response_queue)
 
 		if ( false ~= self.options.withinhost and false ~= self.options.withindomain ) then
@@ -716,11 +738,23 @@
 						url.path = link
 					end
 				end
-				-- if we have a response, proceed scraping it
-				if ( response.body ) then
-					local links = LinkExtractor:new(url, response.body, self.options):getLinks()
-					self.urlqueue:add(links)
-				end		
+                local noscraping = false
+                
+                for _, resource in ipairs(self.options.blacklistforscraping) do
+                    if (string.sub(tostring(url), -string.len(resource.extension) - 1) == "." .. resource.extension or
+                    resource.extension == "*") then
+                        if (resource.location == "outsidehost" and not(LinkExtractor:iswithinhost(url, self.options))) or
+                        (resource.location == "withinhost" and LinkExtractor:iswithinhost(url, self.options)) or
+                        (resource.location == "withinoroutsidehost") then
+                            noscraping = true
+                        end
+                    end
+                end
+				-- if we have a response and the resource is not in the blacklist for scraping, proceed scraping it
+				if response.body and noscraping == false then
+                    local links = LinkExtractor:new(url, response.body, self.options):getLinks()
+                    self.urlqueue:add(links)
+				end
 			else
 				response = { body = "", headers = {} }
 			end
@@ -763,6 +797,9 @@
 	  if ( nil == self.options.useheadfornonwebfiles ) then
 			self.options.useheadfornonwebfiles = stdnse.get_script_args(sn .. ".useheadfornonwebfiles")
 		end
+        if ( nil == self.options.blacklistforscraping ) then
+			self.options.blacklistforscraping = stdnse.get_script_args(sn .. ".blacklistforscraping")
+		end
 	end,
 	
 	-- Loads the argument on a library level
@@ -790,6 +827,9 @@
 	  if ( nil == self.options.useheadfornonwebfiles ) then
 			self.options.useheadfornonwebfiles = stdnse.get_script_args(ln .. ".useheadfornonwebfiles")
 		end
+        if ( nil == self.options.blacklistforscraping ) then
+			self.options.blacklistforscraping = stdnse.get_script_args(ln .. ".blacklistforscraping")
+		end
 	end,
 	
 	-- Loads any defaults for arguments that were not set
@@ -819,8 +859,11 @@
 		self.options.withinhost = tobool(self.options.withinhost)
 		self.options.withindomain = tobool(self.options.withindomain)
 		self.options.noblacklist = tobool(self.options.noblacklist)
-		self.options.useheadfornonwebfiles = tobool(self.options.useheadfornonwebfiles)
+		self.options.useheadfornonwebfiles = tobool(self.options.useheadfornonwebfiles) 
 
+		if type(self.options.blacklistforscraping) ~= "table" then
+            self.options.blacklistforscraping = {}
+        end
 		if ( self.options.withinhost == nil ) then
 			if ( self.options.withindomain ~= true ) then
 				self.options.withinhost = true
@@ -830,9 +873,12 @@
 		end
 		if ( self.options.withindomain == nil ) then
 			self.options.withindomain = false
-		end
+		end 
+        if ( self.options.blacklistforscraping == nil ) then
+            self.options.blacklistforscraping = {} 
+        end
 		self.options.maxdepth = self.options.maxdepth or 3
-		self.options.maxpagecount = self.options.maxpagecount or 20
+		self.options.maxpagecount = self.options.maxpagecount or 20 
 		self.url = self.url or '/'
 	end,	
 	
@@ -863,6 +909,7 @@
 	
 	-- does the crawling
 	crawl = function(self)
+        
 		self.response_queue = self.response_queue or {}
 		local condvar = nmap.condvar(self.response_queue)
 		if ( not(self.thread) ) then



_______________________________________________
Sent through the dev mailing list
http://nmap.org/mailman/listinfo/dev
Archived at http://seclists.org/nmap-dev/
