[prev in list] [next in list] [prev in thread] [next in thread]
List: james-dev
Subject: URI blacklist Matcher
From: Michael Bryant <mgbryant () 4mi ! net>
Date: 2004-12-31 19:41:06
Message-ID: E9C25658-5B63-11D9-AA12-000393C2FEBA () 4mi ! net
[Download RAW message or body]
Hello,
I have developed a Matcher that works with URI blacklists (see
http://www.surbl.org). This Matcher scans the body of the message for
domain names, then performs DNS lookups for those domains against the
supplied URI blacklists (SpamCop calls these "spamvertised" websites).
If any of the domains is listed, the Matcher returns all recipients of
the message.
If there is interest, I would like to contribute this code to the James
project. There may be issues with the way the code and TLD data are
currently organized, the use of java.util.regex, etc. I will be happy
to help out with any additional work that might need to be done.
-Mike Bryant.
["InURIBlacklists.java" (InURIBlacklists.java)]
/***********************************************************************
* Copyright (c) 2004 Michael Bryant . *
* All rights reserved. *
* ------------------------------------------------------------------- *
* Licensed under the Apache License, Version 2.0 (the "License"); you *
* may not use this file except in compliance with the License. You *
* may obtain a copy of the License at: *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an "AS IS" BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or *
* implied. See the License for the specific language governing *
* permissions and limitations under the License. *
* ------------------------------------------------------------------- *
* This software contains code derived from the Apache James Project. *
***********************************************************************/
package net._4mi.james.matchers;
import org.apache.mailet.GenericMatcher;
import org.apache.mailet.Mail;
import net._4mi.james.matchers.util.URIScanner;
import java.util.Collection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.net.UnknownHostException;
import java.io.IOException;
import javax.mail.MessagingException;
import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeMultipart;
import javax.mail.internet.MimePart;
/**
 * Matcher that scans the body of a message for URIs and checks the domains
 * found against a set of URI blacklists (see http://www.surbl.org).
 * <p>
 * The match condition is a list of blacklist zones separated by commas,
 * spaces or tabs. Example:
 * <pre>
 * &lt;mailet match="InURIBlacklists=sc.surbl.org,ab.surbl.org" class="ToProcessor"&gt;
 *   &lt;processor&gt;spam&lt;/processor&gt;
 * &lt;/mailet&gt;
 * </pre>
 */
public class InURIBlacklists extends GenericMatcher {

    /** URI blacklist zones to query, in the order given in the match condition. */
    private final ArrayList uribls = new ArrayList();

    /**
     * Initializes this Matcher with the list of URI blacklist zones taken
     * from the match condition. A missing condition leaves the list empty
     * instead of failing with a NullPointerException.
     *
     * @throws javax.mail.MessagingException declared by the overridden method
     */
    public void init() throws javax.mail.MessagingException {
        uribls.clear();
        String condition = getCondition();
        if (condition != null) {
            StringTokenizer st = new StringTokenizer(condition, ", \t", false);
            while (st.hasMoreTokens()) {
                uribls.add(st.nextToken());
            }
        }
        if (uribls.isEmpty()) {
            log("no URI blacklists configured; this matcher will never match");
        }
        log("uribls="+uribls);
    }

    /**
     * Scans the message body for URIs, then checks the domains found against
     * the configured URI blacklists. The first hit ends the search.
     *
     * @param mail the Mail object which contains a MimeMessage and routing
     *             information
     * @return all of the mail's recipients if any domain is listed on any
     *         blacklist, null otherwise
     * @throws MessagingException if the message cannot be read or parsed
     */
    public Collection match(Mail mail)
            throws MessagingException {
        if (uribls.isEmpty()) {
            // nothing to look up against, so do not bother parsing the message
            return null;
        }

        MimeMessage message = mail.getMessage();
        log("doing URIBL lookup on mail w/ subject: \""+message.getSubject()+"\"");

        HashSet domains = new HashSet(20);
        try {
            scanMailForDomains(domains, message);
        }
        catch (IOException ioe) {
            throw new MessagingException("Could not read MimeMessage", ioe);
        }
        log("found domains: "+domains);

        for (Iterator i=domains.iterator(); i.hasNext();) {
            String domain = (String)i.next();
            log("looking up: \""+domain+"\"");
            for (Iterator j=uribls.iterator(); j.hasNext();) {
                String uribl = (String)j.next();
                log("using uribl: \""+uribl+"\"");
                if (isListed(domain, uribl)) {
                    return mail.getRecipients();
                }
            }
        }
        log("no spammy URIs");
        return null;
    }

    /**
     * Looks up a single domain in a single blacklist zone. Any successful
     * resolution of "domain.zone" is treated as a listing.
     *
     * @param domain the domain (or reversed IP address) to check
     * @param uribl the blacklist zone to query
     * @return true if the combined name resolves, false if it is unknown
     */
    private boolean isListed(String domain, String uribl) {
        String target = domain + "." + uribl;
        log("target: \""+target+"\"");
        try {
            org.apache.james.dnsserver.DNSServer.getByName(target);
            log("got a hit: \""+target+"\"");
            return true;
        }
        catch (UnknownHostException uhe) {
            // name does not resolve: the domain is not on this list
            return false;
        }
    }

    /**
     * Recursively scans all MimeParts of an email for domain strings. Domain
     * strings that are found are added to the supplied HashSet. Only
     * text/plain and text/html parts are scanned; multipart containers are
     * descended into; everything else is ignored.
     *
     * @param domains HashSet for accumulating domain strings
     * @param part MimePart to scan
     * @throws MessagingException if the part cannot be parsed
     * @throws IOException if the part's content cannot be read or decoded
     */
    protected void scanMailForDomains(HashSet domains, MimePart part)
            throws MessagingException, IOException {
        log(" mime type is: \""+part.getContentType()+"\"");
        if (part.isMimeType("text/plain") || part.isMimeType("text/html")) {
            // Decode the content once only, and keep the message body itself
            // out of the log.
            Object content = part.getContent();
            if (content != null) {
                URIScanner.scanContentForDomains(domains, content.toString());
            }
        }
        else if (part.isMimeType("multipart/*")) {
            Object content = part.getContent();
            if (!(content instanceof javax.mail.Multipart)) {
                log(" multipart content is not a Multipart, skipping");
                return;
            }
            javax.mail.Multipart multipart = (javax.mail.Multipart)content;
            int count = multipart.getCount();
            log(" multipart count is: "+count);
            for (int index=0; index<count; index++) {
                javax.mail.BodyPart bodyPart = multipart.getBodyPart(index);
                if (bodyPart instanceof MimePart) {
                    log(" recursing index: "+index);
                    scanMailForDomains(domains, (MimePart)bodyPart);
                }
            }
        }
    }
}
["URIScanner.java" (URIScanner.java)]
/***********************************************************************
* Copyright (c) 2004 Michael Bryant . *
* All rights reserved. *
* ------------------------------------------------------------------- *
* Licensed under the Apache License, Version 2.0 (the "License"); you *
* may not use this file except in compliance with the License. You *
* may obtain a copy of the License at: *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an "AS IS" BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or *
* implied. See the License for the specific language governing *
* permissions and limitations under the License. *
* ------------------------------------------------------------------- *
* This software contains regular expression code derived from the *
* Apache SpamAssassin Project. *
***********************************************************************/
package net._4mi.james.matchers.util;
import java.util.HashSet;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.regex.*;
import java.net.URI;
import java.net.UnknownHostException;
import java.io.IOException;
public class URIScanner {
/* These regular expressions "inspired" by Spamassassin */
static private final String reserved = ";/?:@&=+$,[]\\#|";
static private final String reservedNoColon = ";/?@&=+$,[]\\#|";
static private final String mark = "-_.!~*'()";
static private final String unreserved = "A-Za-z0-9" + escape(mark) + \
"\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f"; static private final String uricSet = \
escape(reserved) + unreserved + "%"; static private final String uricNoColon = \
escape(reservedNoColon) + unreserved + "%"; static private final String schemeRE \
= "(?-xism:(?:https?|ftp|mailto|javascript|file))"; static private final String \
schemelessRE = "(?-xism:(?<![.=])(?:(?i)www\\d*\\.|(?i)ftp\\.))"; static private \
final String uriRE = \
"(?-xism:\\b(?:"+schemeRE+":["+uricNoColon+"]|"+schemelessRE+")["+uricSet+"#]*)";
/** Pre-compiled pattern that matches URIs */
static private final Pattern uriPattern = Pattern.compile(uriRE);
/** Pre-compiled pattern that matches URI scheme strings */
static private final Pattern schemePattern = Pattern.compile("^"+schemeRE+":");
/** Pre-compiled pattern used to cleanup a found URI string */
static private final Pattern uriCleanup = Pattern.compile("^<(.*)>$");
/** Pre-compiled pattern used to cleanup a found URI string */
static private final Pattern uriCleanup2 = Pattern.compile("[\\]\\)>#]$");
/** Pre-compile pattern for identifying "mailto" patterns */
static private final Pattern uriCleanup3 = \
Pattern.compile("^(?i)mailto:([^\\/]{2})(.*)$");
/* These regular expressions also "inspired" by Spamassassin */
static private final String esc = "\\\\";
static private final String period = "\\.";
static private final String space = "\\040";
static private final String open_br = "\\[";
static private final String close_br = "\\]";
static private final String nonASCII = "\\x80-\\xff";
static private final String ctrl = "\\000-\\037";
static private final String cr_list = "\\n\\015";
static private final String qtext = "[^"+esc+nonASCII+cr_list+"\"]";
static private final String dtext = "[^"+esc+nonASCII+cr_list+open_br+close_br+"]";
static private final String quoted_pair = esc+"[^"+nonASCII+"]";
static private final String atom_char = \
"[^("+space+")<>@,;:\"."+esc+open_br+close_br+ctrl+nonASCII+"]"; static private \
final String atom = "(?>"+atom_char+"+)"; static private final String quoted_str = \
"\""+qtext+"*(?:"+quoted_pair+qtext+"*)*\""; static private final String word = \
"(?:"+atom+"|"+quoted_str+")"; static private final String local_part = \
word+"(?:"+period+word+")*"; static private final String label = \
"[A-Za-z\\d](?:[A-Za-z\\d-]*[A-Za-z\\d])?"; static private final String domain_ref \
= label+"(?:"+period+label+")*"; static private final String domain_lit = \
open_br+"(?:"+dtext+"|"+quoted_pair+")*"+close_br; static private final String \
domain = "(?:"+domain_ref+"|"+domain_lit+")"; static private final String \
Addr_spec_re = "(?-xism:"+local_part+"\\s*\\@\\s*"+domain+")";
/** Pre-compiled pattern for matching "schemeless" mailto strings */
static private final Pattern emailAddrPattern = Pattern.compile(Addr_spec_re);
/** Simple reqular expression to match an octet part of an IP address */
static private final String octet = \
"(?:[1-2][0-9][0-9])|(?:[1-9][0-9])|(?:[0-9])";
/** Simple regular expression to match a part of a domain string in the
TLDLookup cache. */
static private final String tld = "[A-Za-z0-9\\-]*";
/** Simple regular expression that matches a two-part TLD */
static private final String tld2 = tld+"\\."+tld;
/** Simple regular expression that matches a three-part TLD */
static private final String tld3 = tld+"\\."+tld+"\\."+tld;
/** Regular expression that matches and captures parts of a possible
one-part TLD domain string */
static private final String tldCap = "("+tld+"\\.("+tld+"))$";
/** Regular expression that matches and captures parts of a possible
two-part TLD domain string */
static private final String tld2Cap = "("+tld+"\\.("+tld2+"))$";
/** Regular expression that matches and captures parts of a possible
three-part TLD domain string */
static private final String tld3Cap = "("+tld+"\\.("+tld3+"))$";
/** Regular expression that matches and captures parts of an IP address */
static private final String ipCap = \
"(("+octet+")\\.("+octet+")\\.("+octet+")\\.("+octet+"))$";
/** Pre-compiled pattern that matches IP addresses */
static private final Pattern ipCapPattern = Pattern.compile(ipCap);
/** Pre-compiled pattern that matches domain string that is possibly
contained in a one-part TLD */
static private final Pattern tldCapPattern = Pattern.compile(tldCap);
/** Pre-compiled pattern that matches domain string that is possibly
contained in a two-part TLD */
static private final Pattern tld2CapPattern = Pattern.compile(tld2Cap);
/** Pre-compiled pattern that matches domain string that is possibly
contained in a three-part TLD */
static private final Pattern tld3CapPattern = Pattern.compile(tld3Cap);
/** controls testing/debug output */
static private boolean testing = false;
/**
* Scans a character sequence for URIs. Then add all unique domain strings
* derived from those found URIs to the supplied HashSet.
* <p>
* This function calls scanContentForHosts() to grab all the host strings.
* Then it calls domainFromHost() on each host string found to distill them
* to their basic "registrar" domains.
*
* @param domains a HashSet to be populated with all domain strings found in
* the content
* @param content a character sequence to be scanned for URIs
*/
static public void scanContentForDomains(HashSet domains, CharSequence content) {
HashSet hosts = scanContentForHosts(content);
for (Iterator i = hosts.iterator(); i.hasNext();) {
String domain = domainFromHost((String)i.next());
if (null != domain) {
if (false == domains.contains(domain)) {
domains.add(domain);
}
}
}
}
/**
* Scans a character sequence for URIs. Then returns all unique host strings
* derived from those found URIs in a HashSet
*
* @param content a character sequence to be scanned for URIs
* @return a HashSet containing host strings
*/
static protected HashSet scanContentForHosts(CharSequence content) {
HashSet set = new HashSet();
try {
// look for URIs
Matcher mat = uriPattern.matcher(content);
while (mat.find()) {
String found = mat.group();
Matcher cleanMat = uriCleanup.matcher(found);
if (cleanMat.find()) {
found = cleanMat.group(1);
}
cleanMat = uriCleanup2.matcher(found);
if (cleanMat.find()) {
found = cleanMat.replaceAll("");
}
cleanMat = uriCleanup3.matcher(found);
if (cleanMat.find()) {
found = "mailto://"+cleanMat.group(1)+cleanMat.group(2);
}
cleanMat = schemePattern.matcher(found);
if (!cleanMat.find()) {
if (found.matches("^(?i)www\\d*\\..*")) {
found = "http://" + found;
}
else if (found.matches("^(?i)ftp\\..*")) {
found = "ftp://" + found;
}
}
String host = hostFromUriStr(found);
if (null != host) {
host = host.toLowerCase();
if (false == set.contains(host)) {
set.add(host);
}
}
}
// look for "schemeless" email addresses, too
mat = emailAddrPattern.matcher(content);
while (mat.find()) {
String found = mat.group();
debugOut("******** mailfound=\""+found+"\"");
found = "mailto://"+found;
debugOut("*******6 mailfoundfound=\""+found+"\" after cleanup 6");
String host = hostFromUriStr(found);
if (null != host) {
host = host.toLowerCase();
if (false == set.contains(host)) {
set.add(host);
}
}
}
}
catch (Exception ex) {
debugOut(ex.toString());
ex.printStackTrace();
}
return set;
}
/**
* Extracts and returns the host portion of URI string.
*
* This function uses java.net.URI.
*
* @param uriStr a string containing a URI
* @return the host portion of the supplied URI, null if no host string
* could be found
*/
static protected String hostFromUriStr(String uriStr) {
debugOut("hostFromUriStr(\""+uriStr+"\")");
String host = null;
try {
URI uri = new URI(uriStr);
host = uri.getHost();
}
catch (Exception ex) {
}
return host;
}
/**
* Extracts and returns the registrar domain portion of a host string. This
* funtion checks all known multi-part TLDs to make sure that registrar
* domain is complete. For example, if the supplied host string is
* "subdomain.example.co.uk", the TLD is "co.uk" and not "uk". Therefore,
* the correct registrar domain is not "co.uk", but "example.co.uk". If the
* domain string is an IP address, then the octets are returned in reverse
* order.
*
* @param host a string containing a host name
* @return the registrar domain portion of the supplied host string
*/
static protected String domainFromHost(String host) {
debugOut("domainFromHost(\""+host+"\")");
String domain = null;
Matcher mat;
try {
// IP addrs
mat = ipCapPattern.matcher(host);
if (mat.find()) {
// reverse the octets now
domain = \
mat.group(5)+"."+mat.group(4)+"."+mat.group(3)+"."+mat.group(2); \
debugOut("domain=\""+domain+"\""); return domain;
}
// 3-part TLDs
mat = tld3CapPattern.matcher(host);
if (mat.find()) {
String tld = mat.group(2);
if (TLDLookup.isThreePartTLD(tld)) {
domain = mat.group(1);
debugOut("domain=\""+domain+", tld=\""+tld+"\"");
return domain;
}
}
// 2-part TLDs
mat = tld2CapPattern.matcher(host);
if (mat.find()) {
String tld = mat.group(2);
if (TLDLookup.isTwoPartTLD(tld)) {
domain = mat.group(1);
debugOut("domain=\""+domain+", tld=\""+tld+"\"");
return domain;
}
}
// 1-part TLDs
mat = tldCapPattern.matcher(host);
if (mat.find()) {
String tld = mat.group(2);
domain = mat.group(1);
debugOut("domain=\""+domain+", tld=\""+tld+"\"");
return domain;
}
}
catch (Exception ex) {
debugOut(ex.toString());
ex.printStackTrace();
}
return domain;
}
/**
* Debugging output
*/
private static void debugOut(String msg) {
if (true == testing) {
System.out.println(msg);
}
}
/**
* Test driver
*/
public static void main(String args[]) {
testing = true;
String str = "jhl http://123.234.12.34/foo.html kh mailto:woof@woof.co.uk \
jlksjl <http://Www.foo.org> hkjhkjhk kljhlkj www3.foobar.org kjhk wWw.foojar.org jh \
fTp.foot.com lhjhkj h www.foo.org";
debugOut("str=\""+str+"\"");
HashSet domains = new HashSet();
scanContentForDomains(domains, str);
for (Iterator i=domains.iterator(); i.hasNext();) {
String domain = (String)i.next();
debugOut("domain = "+domain);
}
}
/**
* A utility function that "escapes" special characters in a string.
*
* @param str a string to be processed
* @return modified "escaped" string
*/
private static String escape(String str) {
StringBuffer buffer = new StringBuffer();
for (int i=0; i<str.length(); i++) {
char ch = str.charAt(i);
if (Character.isDigit(ch) || Character.isUpperCase(ch) || \
Character.isLowerCase(ch) || ch == '_') { buffer.append(ch);
}
else {
buffer.append("\\");
buffer.append(ch);
}
}
return buffer.toString();
}
}
["TLDLookup.java" (TLDLookup.java)]
/***********************************************************************
* Copyright (c) 2004 Michael Bryant . *
* All rights reserved. *
* ------------------------------------------------------------------- *
* Licensed under the Apache License, Version 2.0 (the "License"); you *
* may not use this file except in compliance with the License. You *
* may obtain a copy of the License at: *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an "AS IS" BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or *
* implied. See the License for the specific language governing *
* permissions and limitations under the License. *
***********************************************************************/
package net._4mi.james.matchers.util;
import java.util.HashSet;
/**
 * A utility class that caches sets of multi-part top level domains (TLDs) for
 * quick lookup. The sets are built once, when the class is initialized, from
 * a fixed list of known multi-part TLDs.
 */
public class TLDLookup {

    /** Simple regular expression to match strings in the cache. Note: if the
        collection of known multi-part TLDs changes to contain characters other
        than these, this string must be modified. */
    static private final String tld = "[A-Za-z0-9\\-]*";
    /** Simple regular expression that matches a two-part TLD */
    static private final String tld2 = tld+"\\."+tld;
    /** Simple regular expression that matches a three-part TLD */
    static private final String tld3 = tld+"\\."+tld+"\\."+tld;

    /** Compiled once so the list is not re-parsed against a fresh regex per entry */
    static private final java.util.regex.Pattern twoPartPattern =
        java.util.regex.Pattern.compile("^"+tld2+"$");
    /** Compiled once so the list is not re-parsed against a fresh regex per entry */
    static private final java.util.regex.Pattern threePartPattern =
        java.util.regex.Pattern.compile("^"+tld3+"$");

    /** controls testing/debug output */
    static private boolean testing = false;

    /** Array of all known multi-level TLDs */
    static private final String[] multiPartTLDs = initMultiPartTLDs();
    /** A set of all known two-part TLDs */
    static private final HashSet twoPartTLDs = collectMatching(twoPartPattern);
    /** A set of all known three-part TLDs */
    static private final HashSet threePartTLDs = collectMatching(threePartPattern);

    /**
     * Determines if a two-part domain string (xxx.xxx) is contained in the
     * cache of known two-part TLDs. The comparison is case-sensitive; the
     * cached entries are all lower case.
     *
     * @param domain a String representing a two-part domain
     * @return true if the domain string is found in the cache, false otherwise
     */
    static public boolean isTwoPartTLD(String domain) {
        return twoPartTLDs.contains(domain);
    }

    /**
     * Determines if a three-part domain string (xxx.xxx.xxx) is contained in
     * the cache of known three-part TLDs. The comparison is case-sensitive;
     * the cached entries are all lower case.
     *
     * @param domain a String representing a three-part domain
     * @return true if the domain string is found in the cache, false otherwise
     */
    static public boolean isThreePartTLD(String domain) {
        return threePartTLDs.contains(domain);
    }

    /**
     * Builds a set of those known multi-part TLDs whose whole text matches
     * the supplied pattern.
     *
     * @param shape pattern describing the wanted number of parts
     * @return a HashSet containing every matching entry of the TLD list
     */
    static private HashSet collectMatching(java.util.regex.Pattern shape) {
        HashSet set = new HashSet();
        for (int i=0; i<multiPartTLDs.length; i++) {
            if (shape.matcher(multiPartTLDs[i]).matches()) {
                set.add(multiPartTLDs[i]);
            }
        }
        return set;
    }

    /**
     * Initialize an array of Strings containing all known multi-part TLDs
     *
     * @return an array of all known multi-part TLDs
     */
    static private String[] initMultiPartTLDs() {
        String[] tmp = new String[] {
            "com.ac",
            "edu.ac",
            "gov.ac",
            "edu.ai",
            "gov.ai",
            "com.ar",
            "net.ar",
            "org.ar",
            "gov.ar",
            "mil.ar",
            "edu.ar",
            "int.ar",
            "co.at",
            "ac.at",
            "or.at",
            "gv.at",
            "priv.at",
            "com.au",
            "gov.au",
            "org.au",
            "edu.au",
            "id.au",
            "oz.au",
            "info.au",
            "net.au",
            "asn.au",
            "csiro.au",
            "telememo.au",
            "conf.au",
            "otc.au",
            "com.az",
            "net.az",
            "org.az",
            "com.bb",
            "net.bb",
            "org.bb",
            "ac.be",
            "belgie.be",
            "dns.be",
            "fgov.be",
            "com.bh",
            "gov.bh",
            "net.bh",
            "edu.bh",
            "org.bh",
            "com.bm",
            "edu.bm",
            "gov.bm",
            "org.bm",
            "net.bm",
            "adm.br",
            "adv.br",
            "agr.br",
            "am.br",
            "arq.br",
            "art.br",
            "ato.br",
            "bio.br",
            "bmd.br",
            "cim.br",
            "cng.br",
            "cnt.br",
            "com.br",
            "coop.br",
            "ecn.br",
            "edu.br",
            "eng.br",
            "esp.br",
            "etc.br",
            "eti.br",
            "far.br",
            "fm.br",
            "fnd.br",
            "fot.br",
            "fst.br",
            "g12.br",
            "ggf.br",
            "gov.br",
            "imb.br",
            "ind.br",
            "inf.br",
            "jor.br",
            "lel.br",
            "mat.br",
            "med.br",
            "mil.br",
            "mus.br",
            "net.br",
            "nom.br",
            "not.br",
            "ntr.br",
            "odo.br",
            "org.br",
            "ppg.br",
            "pro.br",
            "psc.br",
            "psi.br",
            "qsl.br",
            "rec.br",
            "slg.br",
            "srv.br",
            "tmp.br",
            "trd.br",
            "tur.br",
            "tv.br",
            "vet.br",
            "zlg.br",
            "com.bs",
            "net.bs",
            "org.bs",
            "ab.ca",
            "bc.ca",
            "mb.ca",
            "nb.ca",
            "nf.ca",
            "nl.ca",
            "ns.ca",
            "nt.ca",
            "nu.ca",
            "on.ca",
            "pe.ca",
            "qc.ca",
            "sk.ca",
            "yk.ca",
            "co.ck",
            "net.ck",
            "org.ck",
            "edu.ck",
            "gov.ck",
            "com.cn",
            "edu.cn",
            "gov.cn",
            "net.cn",
            "org.cn",
            "ac.cn",
            "ah.cn",
            "bj.cn",
            "cq.cn",
            "gd.cn",
            "gs.cn",
            "gx.cn",
            "gz.cn",
            "hb.cn",
            "he.cn",
            "hi.cn",
            "hk.cn",
            "hl.cn",
            "hn.cn",
            "jl.cn",
            "js.cn",
            "ln.cn",
            "mo.cn",
            "nm.cn",
            "nx.cn",
            "qh.cn",
            "sc.cn",
            "sn.cn",
            "sh.cn",
            "sx.cn",
            "tj.cn",
            "tw.cn",
            "xj.cn",
            "xz.cn",
            "yn.cn",
            "zj.cn",
            "arts.co",
            "com.co",
            "edu.co",
            "firm.co",
            "gov.co",
            "info.co",
            "int.co",
            "nom.co",
            "mil.co",
            "org.co",
            "rec.co",
            "store.co",
            "web.co",
            "ac.cr",
            "co.cr",
            "ed.cr",
            "fi.cr",
            "go.cr",
            "or.cr",
            "sa.cr",
            "com.cu",
            "net.cu",
            "org.cu",
            "ac.cy",
            "com.cy",
            "gov.cy",
            "net.cy",
            "org.cy",
            "co.dk",
            "art.do",
            "com.do",
            "edu.do",
            "gov.do",
            "org.do",
            "mil.do",
            "net.do",
            "web.do",
            "com.dz",
            "org.dz",
            "net.dz",
            "gov.dz",
            "edu.dz",
            "ass.dz",
            "pol.dz",
            "art.dz",
            "com.ec",
            "k12.ec",
            "edu.ec",
            "fin.ec",
            "med.ec",
            "gov.ec",
            "mil.ec",
            "org.ec",
            "net.ec",
            "com.eg",
            "edu.eg",
            "eun.eg",
            "gov.eg",
            "net.eg",
            "org.eg",
            "sci.eg",
            "com.er",
            "net.er",
            "org.er",
            "edu.er",
            "mil.er",
            "gov.er",
            "ind.er",
            "com.et",
            "gov.et",
            "org.et",
            "edu.et",
            "net.et",
            "biz.et",
            "name.et",
            "info.et",
            "ac.fj",
            "com.fj",
            "gov.fj",
            "id.fj",
            "org.fj",
            "school.fj",
            "com.fk",
            "ac.fk",
            "gov.fk",
            "net.fk",
            "nom.fk",
            "org.fk",
            "asso.fr",
            "nom.fr",
            "barreau.fr",
            "com.fr",
            "prd.fr",
            "presse.fr",
            "tm.fr",
            "aeroport.fr",
            "assedic.fr",
            "avocat.fr",
            "avoues.fr",
            "cci.fr",
            "chambagri.fr",
            "chirurgiens-dentistes.fr",
            "experts-comptables.fr",
            "geometre-expert.fr",
            "gouv.fr",
            "greta.fr",
            "huissier-justice.fr",
            "medecin.fr",
            "notaires.fr",
            "pharmacien.fr",
            "port.fr",
            "veterinaire.fr",
            "com.ge",
            "edu.ge",
            "gov.ge",
            "mil.ge",
            "net.ge",
            "org.ge",
            "pvt.ge",
            "co.gg",
            "org.gg",
            "sch.gg",
            "ac.gg",
            "gov.gg",
            "ltd.gg",
            "ind.gg",
            "net.gg",
            "alderney.gg",
            "guernsey.gg",
            "sark.gg",
            "com.gu",
            "edu.gu",
            "net.gu",
            "org.gu",
            "gov.gu",
            "mil.gu",
            "com.hk",
            "net.hk",
            "org.hk",
            "idv.hk",
            "gov.hk",
            "edu.hk",
            "co.hu",
            "2000.hu",
            "erotika.hu",
            "jogasz.hu",
            "sex.hu",
            "video.hu",
            "info.hu",
            "agrar.hu",
            "film.hu",
            "konyvelo.hu",
            "shop.hu",
            "org.hu",
            "bolt.hu",
            "forum.hu",
            "lakas.hu",
            "suli.hu",
            "priv.hu",
            "casino.hu",
            "games.hu",
            "media.hu",
            "szex.hu",
            "sport.hu",
            "city.hu",
            "hotel.hu",
            "news.hu",
            "tozsde.hu",
            "tm.hu",
            "erotica.hu",
            "ingatlan.hu",
            "reklam.hu",
            "utazas.hu",
            "ac.id",
            "co.id",
            "go.id",
            "mil.id",
            "net.id",
            "or.id",
            "co.il",
            "net.il",
            "org.il",
            "ac.il",
            "gov.il",
            "k12.il",
            "muni.il",
            "idf.il",
            "co.im",
            "net.im",
            "org.im",
            "ac.im",
            "lkd.co.im",
            "gov.im",
            "nic.im",
            "plc.co.im",
            "co.in",
            "net.in",
            "ac.in",
            "ernet.in",
            "gov.in",
            "nic.in",
            "res.in",
            "gen.in",
            "firm.in",
            "mil.in",
            "org.in",
            "ind.in",
            "ac.je",
            "co.je",
            "net.je",
            "org.je",
            "gov.je",
            "ind.je",
            "jersey.je",
            "ltd.je",
            "sch.je",
            "com.jo",
            "org.jo",
            "net.jo",
            "gov.jo",
            "edu.jo",
            "mil.jo",
            "ad.jp",
            "ac.jp",
            "co.jp",
            "go.jp",
            "or.jp",
            "ne.jp",
            "gr.jp",
            "ed.jp",
            "lg.jp",
            "net.jp",
            "org.jp",
            "gov.jp",
            "hokkaido.jp",
            "aomori.jp",
            "iwate.jp",
            "miyagi.jp",
            "akita.jp",
            "yamagata.jp",
            "fukushima.jp",
            "ibaraki.jp",
            "tochigi.jp",
            "gunma.jp",
            "saitama.jp",
            "chiba.jp",
            "tokyo.jp",
            "kanagawa.jp",
            "niigata.jp",
            "toyama.jp",
            "ishikawa.jp",
            "fukui.jp",
            "yamanashi.jp",
            "nagano.jp",
            "gifu.jp",
            "shizuoka.jp",
            "aichi.jp",
            "mie.jp",
            "shiga.jp",
            "kyoto.jp",
            "osaka.jp",
            "hyogo.jp",
            "nara.jp",
            "wakayama.jp",
            "tottori.jp",
            "shimane.jp",
            "okayama.jp",
            "hiroshima.jp",
            "yamaguchi.jp",
            "tokushima.jp",
            "kagawa.jp",
            "ehime.jp",
            "kochi.jp",
            "fukuoka.jp",
            "saga.jp",
            "nagasaki.jp",
            "kumamoto.jp",
            "oita.jp",
            "miyazaki.jp",
            "kagoshima.jp",
            "okinawa.jp",
            "sapporo.jp",
            "sendai.jp",
            "yokohama.jp",
            "kawasaki.jp",
            "nagoya.jp",
            "kobe.jp",
            "kitakyushu.jp",
            "utsunomiya.jp",
            "kanazawa.jp",
            "takamatsu.jp",
            "matsuyama.jp",
            "com.kh",
            "net.kh",
            "org.kh",
            "per.kh",
            "edu.kh",
            "gov.kh",
            "mil.kh",
            "ac.kr",
            "co.kr",
            "go.kr",
            "ne.kr",
            "or.kr",
            "pe.kr",
            "re.kr",
            "seoul.kr",
            "kyonggi.kr",
            "com.kw",
            "net.kw",
            "org.kw",
            "edu.kw",
            "gov.kw",
            "com.la",
            "net.la",
            "org.la",
            "com.lb",
            "org.lb",
            "net.lb",
            "edu.lb",
            "gov.lb",
            "mil.lb",
            "com.lc",
            "edu.lc",
            "gov.lc",
            "net.lc",
            "org.lc",
            "com.lv",
            "net.lv",
            "org.lv",
            "edu.lv",
            "gov.lv",
            "mil.lv",
            "id.lv",
            "asn.lv",
            "conf.lv",
            "com.ly",
            "net.ly",
            "org.ly",
            "co.ma",
            "net.ma",
            "org.ma",
            "press.ma",
            "ac.ma",
            "com.mk",
            "com.mm",
            "net.mm",
            "org.mm",
            "edu.mm",
            "gov.mm",
            "com.mo",
            "net.mo",
            "org.mo",
            "edu.mo",
            "gov.mo",
            "com.mt",
            "net.mt",
            "org.mt",
            "edu.mt",
            "tm.mt",
            "uu.mt",
            "com.mx",
            "net.mx",
            "org.mx",
            "com.my",
            "org.my",
            "gov.my",
            "edu.my",
            "net.my",
            "com.na",
            "org.na",
            "net.na",
            "alt.na",
            "edu.na",
            "cul.na",
            "unam.na",
            "telecom.na",
            "com.nc",
            "net.nc",
            "org.nc",
            "ac.ng",
            "edu.ng",
            "sch.ng",
            "com.ng",
            "gov.ng",
            "org.ng",
            "net.ng",
            "gob.ni",
            "com.ni",
            "net.ni",
            "edu.ni",
            "nom.ni",
            "org.ni",
            "com.np",
            "net.np",
            "org.np",
            "gov.np",
            "edu.np",
            "ac.nz",
            "co.nz",
            "cri.nz",
            "gen.nz",
            "geek.nz",
            "govt.nz",
            "iwi.nz",
            "maori.nz",
            "mil.nz",
            "net.nz",
            "org.nz",
            "school.nz",
            "com.om",
            "co.om",
            "edu.om",
            "ac.om",
            "gov.om",
            "net.om",
            "org.om",
            "mod.om",
            "museum.om",
            "biz.om",
            "pro.om",
            "med.om",
            "com.pa",
            "net.pa",
            "org.pa",
            "edu.pa",
            "ac.pa",
            "gob.pa",
            "sld.pa",
            "edu.pe",
            "gob.pe",
            "nom.pe",
            "mil.pe",
            "org.pe",
            "com.pe",
            "net.pe",
            "com.pg",
            "net.pg",
            "ac.pg",
            "com.ph",
            "net.ph",
            "org.ph",
            "mil.ph",
            "ngo.ph",
            "aid.pl",
            "agro.pl",
            "atm.pl",
            "auto.pl",
            "biz.pl",
            "com.pl",
            "edu.pl",
            "gmina.pl",
            "gsm.pl",
            "info.pl",
            "mail.pl",
            "miasta.pl",
            "media.pl",
            "mil.pl",
            "net.pl",
            "nieruchomosci.pl",
            "nom.pl",
            "org.pl",
            "pc.pl",
            "powiat.pl",
            "priv.pl",
            "realestate.pl",
            "rel.pl",
            "sex.pl",
            "shop.pl",
            "sklep.pl",
            "sos.pl",
            "szkola.pl",
            "targi.pl",
            "tm.pl",
            "tourism.pl",
            "travel.pl",
            "turystyka.pl",
            "com.pk",
            "net.pk",
            "edu.pk",
            "org.pk",
            "fam.pk",
            "biz.pk",
            "web.pk",
            "gov.pk",
            "gob.pk",
            "gok.pk",
            "gon.pk",
            "gop.pk",
            "gos.pk",
            "edu.ps",
            "gov.ps",
            "plo.ps",
            "sec.ps",
            "com.py",
            "net.py",
            "org.py",
            "edu.py",
            "com.qa",
            "net.qa",
            "org.qa",
            "edu.qa",
            "gov.qa",
            "asso.re",
            "com.re",
            "nom.re",
            "com.ru",
            "net.ru",
            "org.ru",
            "pp.ru",
            "com.sa",
            "edu.sa",
            "sch.sa",
            "med.sa",
            "gov.sa",
            "net.sa",
            "org.sa",
            "pub.sa",
            "com.sb",
            "net.sb",
            "org.sb",
            "edu.sb",
            "gov.sb",
            "com.sd",
            "net.sd",
            "org.sd",
            "edu.sd",
            "sch.sd",
            "med.sd",
            "gov.sd",
            "tm.se",
            "press.se",
            "parti.se",
            "brand.se",
            "fh.se",
            "fhsk.se",
            "fhv.se",
            "komforb.se",
            "kommunalforbund.se",
            "komvux.se",
            "lanarb.se",
            "lanbib.se",
            "naturbruksgymn.se",
            "sshn.se",
            "org.se",
            "pp.se",
            "com.sg",
            "net.sg",
            "org.sg",
            "edu.sg",
            "gov.sg",
            "per.sg",
            "com.sh",
            "net.sh",
            "org.sh",
            "edu.sh",
            "gov.sh",
            "mil.sh",
            "gov.st",
            "saotome.st",
            "principe.st",
            "consulado.st",
            "embaixada.st",
            "org.st",
            "edu.st",
            "net.st",
            "com.st",
            "store.st",
            "mil.st",
            "co.st",
            "com.sv",
            "org.sv",
            "edu.sv",
            "gob.sv",
            "red.sv",
            "com.sy",
            "net.sy",
            "org.sy",
            "gov.sy",
            "ac.th",
            "co.th",
            "go.th",
            "net.th",
            "or.th",
            "com.tn",
            "net.tn",
            "org.tn",
            "edunet.tn",
            "gov.tn",
            "ens.tn",
            "fin.tn",
            "nat.tn",
            "ind.tn",
            "info.tn",
            "intl.tn",
            "rnrt.tn",
            "rnu.tn",
            "rns.tn",
            "tourism.tn",
            "com.tr",
            "net.tr",
            "org.tr",
            "edu.tr",
            "gov.tr",
            "mil.tr",
            "bbs.tr",
            "k12.tr",
            "gen.tr",
            "co.tt",
            "com.tt",
            "org.tt",
            "net.tt",
            "biz.tt",
            "info.tt",
            "pro.tt",
            "name.tt",
            "gov.tt",
            "edu.tt",
            "nic.tt",
            "us.tt",
            "uk.tt",
            "ca.tt",
            "eu.tt",
            "es.tt",
            "fr.tt",
            "it.tt",
            "se.tt",
            "dk.tt",
            "be.tt",
            "de.tt",
            "at.tt",
            "au.tt",
            "co.tv",
            "com.tw",
            "net.tw",
            "org.tw",
            "edu.tw",
            "idv.tw",
            "gove.tw",
            "com.ua",
            "net.ua",
            "org.ua",
            "edu.ua",
            "gov.ua",
            "ac.ug",
            "co.ug",
            "or.ug",
            "go.ug",
            "co.uk",
            "me.uk",
            "org.uk",
            "edu.uk",
            "ltd.uk",
            "plc.uk",
            "net.uk",
            "sch.uk",
            "nic.uk",
            "ac.uk",
            "gov.uk",
            "nhs.uk",
            "police.uk",
            "mod.uk",
            "dni.us",
            "fed.us",
            "com.uy",
            "edu.uy",
            "net.uy",
            "org.uy",
            "gub.uy",
            "mil.uy",
            "com.ve",
            "net.ve",
            "org.ve",
            "co.ve",
            "edu.ve",
            "gov.ve",
            "mil.ve",
            "arts.ve",
            "bib.ve",
            "firm.ve",
            "info.ve",
            "int.ve",
            "nom.ve",
            "rec.ve",
            "store.ve",
            "tec.ve",
            "web.ve",
            "co.vi",
            "net.vi",
            "org.vi",
            "com.vn",
            "biz.vn",
            "edu.vn",
            "gov.vn",
            "net.vn",
            "org.vn",
            "int.vn",
            "ac.vn",
            "pro.vn",
            "info.vn",
            "health.vn",
            "name.vn",
            "com.vu",
            "edu.vu",
            "net.vu",
            "org.vu",
            "de.vu",
            "ch.vu",
            "fr.vu",
            "com.ws",
            "net.ws",
            "org.ws",
            "gov.ws",
            "edu.ws",
            "ac.yu",
            "co.yu",
            "edu.yu",
            "org.yu",
            "com.ye",
            "net.ye",
            "org.ye",
            "gov.ye",
            "edu.ye",
            "mil.ye",
            "ac.za",
            "alt.za",
            "bourse.za",
            "city.za",
            "co.za",
            "edu.za",
            "gov.za",
            "law.za",
            "mil.za",
            "net.za",
            "ngo.za",
            "nom.za",
            "org.za",
            "school.za",
            "tm.za",
            "web.za",
            "co.zw",
            "ac.zw",
            "org.zw",
            "gov.zw",
            "eu.org",
            "au.com",
            "br.com",
            "cn.com",
            "de.com",
            "de.net",
            "eu.com",
            "gb.com",
            "gb.net",
            "hu.com",
            "no.com",
            "qc.com",
            "ru.com",
            "sa.com",
            "se.com",
            "uk.com",
            "uk.net",
            "us.com",
            "uy.com",
            "za.com",
            "dk.org",
            "tel.no",
            "fax.nr",
            "mob.nr",
            "mobil.nr",
            "mobile.nr",
            "tel.nr",
            "tlf.nr",
            "e164.arpa"
        };
        return tmp;
    }

    /**
     * Debugging output
     */
    private static void debugOut(String msg) {
        if (testing) {
            System.out.println(msg);
        }
    }

    /**
     * Test driver
     */
    public static void main(String args[]) {
        testing = true;
        // The caches are built during class initialization, before this flag
        // can be set, so their sizes are reported here.
        debugOut("array size=" + multiPartTLDs.length);
        debugOut("initTwoPartTLDs size="+twoPartTLDs.size());
        debugOut("initThreePartTLDs size="+threePartTLDs.size());
        String[] test2 = new String[] {
            "woof.com",
            "co.uk",
            "lkd.co.im",
            "gov.qa",
        };
        String[] test3 = new String[] {
            "woof.woof.com",
            "lkd.co.im",
            "gov.qa",
        };
        int i;
        debugOut("2 part TLDs --------");
        for (i=0; i<test2.length; i++) {
            debugOut(test2[i]+" found is: "+isTwoPartTLD(test2[i]));
        }
        debugOut("3 part TLDs --------");
        for (i=0; i<test3.length; i++) {
            debugOut(test3[i]+" found is: "+isThreePartTLD(test3[i]));
        }
    }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic