[prev in list] [next in list] [prev in thread] [next in thread] 

List:       spamassassin-devel
Subject:    Re: svn commit: r1880308 - /spamassassin/trunk/masses/hit-frequencies
From:       "Kevin A. McGrail" <kmcgrail () apache ! org>
Date:       2020-07-26 14:07:44
Message-ID: CAMMMAUH7YZiLrfcmhzZ1Gz5PnGU+pQtRFJOphwJ7qoVkBfizEg () mail ! gmail ! com
[Download RAW message or body]

Nice!

On Sun, Jul 26, 2020, 01:50 <hege@apache.org> wrote:

> Author: hege
> Date: Sun Jul 26 05:50:00 2020
> New Revision: 1880308
> 
> URL: http://svn.apache.org/viewvc?rev=1880308&view=rev
> Log:
> Tweaks to increase speed, cut runtime in half
> 
> Modified:
> spamassassin/trunk/masses/hit-frequencies
> 
> Modified: spamassassin/trunk/masses/hit-frequencies
> URL:
> http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?rev=1880308&r1=1880307&r2=1880308&view=diff
>  
> ==============================================================================
> --- spamassassin/trunk/masses/hit-frequencies (original)
> +++ spamassassin/trunk/masses/hit-frequencies Sun Jul 26 05:50:00 2020
> @@ -805,52 +805,48 @@ sub compute_overlaps_for_rule {
> my %overlaps_ham1r = ();
> my %overlaps_spam1r = ();
> 
> -  foreach my $r2 (keys %hmap_spam) {
> -    next if $r1 eq $r2;
> -
> -    # require that both rules have at least 1 hit
> -    next unless ($freq_spam{$r1} && $freq_spam{$r2});
> -
> -    my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,
> -                                    $hmap_spam{$r2}, $hmap_spam{$r1});
> -
> -    if ($a1ina2 > 0)
> -    {
> -      $overlaps_spam1r{$r2} = $a1ina2;
> -
> -      if (exists $overlaps_spam1{$a1ina2})
> -      { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> -      else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }
> -
> -      if (exists $overlaps_spam2{$a2ina1})
> -      { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; }
> -      else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }
> +  if ($freq_spam{$r1}) {
> +    foreach my $r2 (keys %hmap_spam) {
> +      next if $r1 eq $r2;
> +
> +      my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,
> +                                      $hmap_spam{$r2}, $hmap_spam{$r1});
> +
> +      if ($a1ina2 > 0)
> +      {
> +        $overlaps_spam1r{$r2} = $a1ina2;
> +
> +        if (exists $overlaps_spam1{$a1ina2})
> +        { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> +        else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }
> +
> +        if (exists $overlaps_spam2{$a2ina1})
> +        { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; }
> +        else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }
> +      }
> }
> -
> }
> 
> -  foreach my $r2 (keys %hmap_ham) {
> -    next if $r1 eq $r2;
> -
> -    # require that both rules have at least 1 hit
> -    next unless ($freq_ham{$r1} && $freq_ham{$r2});
> -
> -    my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,
> -                                    $hmap_ham{$r2}, $hmap_ham{$r1});
> -
> -    if ($a1ina2 > 0)
> -    {
> -      $overlaps_ham1r{$r2} = $a1ina2;
> -
> -      if (exists $overlaps_ham1{$a1ina2})
> -      { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> -      else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }
> -
> -      if (exists $overlaps_ham2{$a2ina1})
> -      { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }
> -      else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }
> +  if ($freq_ham{$r1}) {
> +    foreach my $r2 (keys %hmap_ham) {
> +      next if $r1 eq $r2;
> +
> +      my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,
> +                                      $hmap_ham{$r2}, $hmap_ham{$r1});
> +
> +      if ($a1ina2 > 0)
> +      {
> +        $overlaps_ham1r{$r2} = $a1ina2;
> +
> +        if (exists $overlaps_ham1{$a1ina2})
> +        { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> +        else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }
> +
> +        if (exists $overlaps_ham2{$a2ina1})
> +        { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }
> +        else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }
> +      }
> }
> -
> }
> 
> _print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, "spam",
> \%overlaps_ham1r, "ham");
> @@ -934,25 +930,23 @@ sub _prettify_overlap_rules {
> sub _hmap_to_overlap_ratio {
> my ($r1, $r2, $hmap1, $hmap2) = @_;
> 
> -  $hmap1 ||= '';
> -  $hmap2 ||= '';
> -  if ($hmap1 !~ /[^\000]/ || $hmap2 !~ /[^\000]/) {
> -    # no hits on either! this would normally give a 100% hitrate match,
> -    # but that's misleading -- so hide it by giving it a 0% overlap.
> -    #
> -    # also, ignore cases where there are no hits on *one* of the rules,
> -    # while there are hits on the other -- after all, if one rule doesn't
> -    # have a single hit, it cannot overlap.
> -    #
> -    return (0,0);
> -  }
> -
> # my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print vec($hmap2,$i,1); } print "\n";
> 
> # count bits in each, so we can show when one is fully subsumed by another
> # with perl's support for bitstring ops, we get C speed here, nice!
> +
> +  # no hits on either? this would normally give a 100% hitrate match,
> +  # but that's misleading -- so hide it by giving it a 0% overlap.
> +  #
> +  # also, ignore cases where there are no hits on *one* of the rules,
> +  # while there are hits on the other -- after all, if one rule doesn't
> +  # have a single hit, it cannot overlap.
> +
> my $a1 = unpack("%32b*", $hmap1);
> +  return (0,0) unless $a1;
> my $a2 = unpack("%32b*", $hmap2);
> +  return (0,0) unless $a2;
> +
> my $a1_and_a2 = unpack("%32b*", ($hmap1 & $hmap2));
> 
> # round rather than truncate
> 
> 
> 


[Attachment #3 (text/html)]

<div dir="auto">Nice!</div><br><div class="gmail_quote"><div dir="ltr" \
class="gmail_attr">On Sun, Jul 26, 2020, 01:50  &lt;<a \
href="mailto:hege@apache.org">hege@apache.org</a>&gt; wrote:<br></div><blockquote \
class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc \
                solid;padding-left:1ex">Author: hege<br>
Date: Sun Jul 26 05:50:00 2020<br>
New Revision: 1880308<br>
<br>
URL: <a href="http://svn.apache.org/viewvc?rev=1880308&amp;view=rev" rel="noreferrer \
noreferrer" target="_blank">http://svn.apache.org/viewvc?rev=1880308&amp;view=rev</a><br>
 Log:<br>
Tweaks to increase speed, cut runtime in half<br>
<br>
Modified:<br>
      spamassassin/trunk/masses/hit-frequencies<br>
<br>
Modified: spamassassin/trunk/masses/hit-frequencies<br>
URL: <a href="http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?rev=1880308&amp;r1=1880307&amp;r2=1880308&amp;view=diff" \
rel="noreferrer noreferrer" \
target="_blank">http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?rev=1880308&amp;r1=1880307&amp;r2=1880308&amp;view=diff</a><br>
 ==============================================================================<br>
--- spamassassin/trunk/masses/hit-frequencies (original)<br>
+++ spamassassin/trunk/masses/hit-frequencies Sun Jul 26 05:50:00 2020<br>
@@ -805,52 +805,48 @@ sub compute_overlaps_for_rule {<br>
     my %overlaps_ham1r = ();<br>
     my %overlaps_spam1r = ();<br>
<br>
-   foreach my $r2 (keys %hmap_spam) {<br>
-      next if $r1 eq $r2;<br>
-<br>
-      # require that both rules have at least 1 hit<br>
-      next unless ($freq_spam{$r1} &amp;&amp; $freq_spam{$r2});<br>
-<br>
-      my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,<br>
-                                                      $hmap_spam{$r2}, \
                $hmap_spam{$r1});<br>
-<br>
-      if ($a1ina2 &gt; 0)<br>
-      {<br>
-         $overlaps_spam1r{$r2} = $a1ina2;<br>
-<br>
-         if (exists $overlaps_spam1{$a1ina2})<br>
-         { $overlaps_spam1{$a1ina2} .= &quot; &quot;.$r2.&quot;[$a2ina1]&quot;; \
                }<br>
-         else { $overlaps_spam1{$a1ina2} = $r2.&quot;[$a2ina1]&quot;; }<br>
-<br>
-         if (exists $overlaps_spam2{$a2ina1})<br>
-         { $overlaps_spam2{$a2ina1} .= &quot; &quot;.$r2.&quot;[$a2ina1]&quot;; \
                }<br>
-         else { $overlaps_spam2{$a2ina1} = $r2.&quot;[$a2ina1]&quot;; }<br>
+   if ($freq_spam{$r1}) {<br>
+      foreach my $r2 (keys %hmap_spam) {<br>
+         next if $r1 eq $r2;<br>
+<br>
+         my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,<br>
+                                                         $hmap_spam{$r2}, \
$hmap_spam{$r1});<br> +<br>
+         if ($a1ina2 &gt; 0)<br>
+         {<br>
+            $overlaps_spam1r{$r2} = $a1ina2;<br>
+<br>
+            if (exists $overlaps_spam1{$a1ina2})<br>
+            { $overlaps_spam1{$a1ina2} .= &quot; &quot;.$r2.&quot;[$a2ina1]&quot;; \
}<br> +            else { $overlaps_spam1{$a1ina2} = $r2.&quot;[$a2ina1]&quot;; }<br>
+<br>
+            if (exists $overlaps_spam2{$a2ina1})<br>
+            { $overlaps_spam2{$a2ina1} .= &quot; &quot;.$r2.&quot;[$a2ina1]&quot;; \
}<br> +            else { $overlaps_spam2{$a2ina1} = $r2.&quot;[$a2ina1]&quot;; }<br>
+         }<br>
        }<br>
-<br>
     }<br>
<br>
-   foreach my $r2 (keys %hmap_ham) {<br>
-      next if $r1 eq $r2;<br>
-<br>
-      # require that both rules have at least 1 hit<br>
-      next unless ($freq_ham{$r1} &amp;&amp; $freq_ham{$r2});<br>
-<br>
-      my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,<br>
-                                                      $hmap_ham{$r2}, \
                $hmap_ham{$r1});<br>
-<br>
-      if ($a1ina2 &gt; 0)<br>
-      {<br>
-         $overlaps_ham1r{$r2} = $a1ina2;<br>
-<br>
-         if (exists $overlaps_ham1{$a1ina2})<br>
-         { $overlaps_ham1{$a1ina2} .= &quot; &quot;.$r2.&quot;[$a2ina1]&quot;; }<br>
-         else { $overlaps_ham1{$a1ina2} = $r2.&quot;[$a2ina1]&quot;; }<br>
-<br>
-         if (exists $overlaps_ham2{$a2ina1})<br>
-         { $overlaps_ham2{$a2ina1} .= &quot; &quot;.$r2.&quot;[$a1ina2]&quot;; }<br>
-         else { $overlaps_ham2{$a2ina1} = $r2.&quot;[$a1ina2]&quot;; }<br>
+   if ($freq_ham{$r1}) {<br>
+      foreach my $r2 (keys %hmap_ham) {<br>
+         next if $r1 eq $r2;<br>
+<br>
+         my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,<br>
+                                                         $hmap_ham{$r2}, \
$hmap_ham{$r1});<br> +<br>
+         if ($a1ina2 &gt; 0)<br>
+         {<br>
+            $overlaps_ham1r{$r2} = $a1ina2;<br>
+<br>
+            if (exists $overlaps_ham1{$a1ina2})<br>
+            { $overlaps_ham1{$a1ina2} .= &quot; &quot;.$r2.&quot;[$a2ina1]&quot;; \
}<br> +            else { $overlaps_ham1{$a1ina2} = $r2.&quot;[$a2ina1]&quot;; }<br>
+<br>
+            if (exists $overlaps_ham2{$a2ina1})<br>
+            { $overlaps_ham2{$a2ina1} .= &quot; &quot;.$r2.&quot;[$a1ina2]&quot;; \
}<br> +            else { $overlaps_ham2{$a2ina1} = $r2.&quot;[$a1ina2]&quot;; }<br>
+         }<br>
        }<br>
-<br>
     }<br>
<br>
     _print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, &quot;spam&quot;, \
\%overlaps_ham1r, &quot;ham&quot;);<br> @@ -934,25 +930,23 @@ sub \
_prettify_overlap_rules {<br>  sub _hmap_to_overlap_ratio {<br>
     my ($r1, $r2, $hmap1, $hmap2) = @_;<br>
<br>
-   $hmap1 ||= &#39;&#39;;<br>
-   $hmap2 ||= &#39;&#39;;<br>
-   if ($hmap1 !~ /[^\000]/ || $hmap2 !~ /[^\000]/) {<br>
-      # no hits on either! this would normally give a 100% hitrate match,<br>
-      # but that&#39;s misleading -- so hide it by giving it a 0% overlap.<br>
-      #<br>
-      # also, ignore cases where there are no hits on *one* of the rules,<br>
-      # while there are hits on the other -- after all, if one rule doesn&#39;t<br>
-      # have a single hit, it cannot overlap.<br>
-      #<br>
-      return (0,0);<br>
-   }<br>
-<br>
     # my $i; for ($i = 0; $i &lt; length($hmap1)*8; $i++) { print vec($hmap1,$i,1); \
} print &quot;\n&quot;; for ($i = 0; $i &lt; length($hmap2)*8; $i++) { print \
vec($hmap2,$i,1); } print &quot;\n&quot;;<br> <br>
     # count bits in each, so we can show when one is fully subsumed by another<br>
     # with perl&#39;s support for bitstring ops, we get C speed here, nice!<br>
+<br>
+   # no hits on either? this would normally give a 100% hitrate match,<br>
+   # but that&#39;s misleading -- so hide it by giving it a 0% overlap.<br>
+   #<br>
+   # also, ignore cases where there are no hits on *one* of the rules,<br>
+   # while there are hits on the other -- after all, if one rule doesn&#39;t<br>
+   # have a single hit, it cannot overlap.<br>
+<br>
     my $a1 = unpack(&quot;%32b*&quot;, $hmap1);<br>
+   return (0,0) unless $a1;<br>
     my $a2 = unpack(&quot;%32b*&quot;, $hmap2);<br>
+   return (0,0) unless $a2;<br>
+<br>
     my $a1_and_a2 = unpack(&quot;%32b*&quot;, ($hmap1 &amp; $hmap2));<br>
<br>
     # round rather than truncate<br>
<br>
<br>
</blockquote></div>



[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic