[prev in list] [next in list] [prev in thread] [next in thread] 

List:       sas-l
Subject:    SAS-L: Preventing data disclosure by setting cells with values less than 5 to missing(not robust)
From:       Roger DeAngelis <rogerjdeangelis () GMAIL ! COM>
Date:       2017-06-29 18:39:59
Message-ID: 5972017915751533.WA.rogerjdeangelisgmail.com () listserv ! uga ! edu
[Download RAW message or body]

SAS-L: Preventing data disclosure by setting cells with values less than 5 to missing(not robust)

But might be enough, depends on risk?

   Suggest you look ar R packages(I did not have thta option)

      sdcMicro
      sdcTable

   WPS/SAS/R
   =========

       WORKING CODE
          havmis<-havsub[havsub < 5] <- NA;  * set all <5 to missing

          rowmis<-rowSums(is.na(havsub));  * get count of row and col missings;
          colmis<-colSums(is.na(havsub));

          * if missing count=1 in a column then set next smallest to missing;
          for(i in 1:ncol(havsub)){
               if ( colmis[i]==1)  {
                  havsub[(havsub[,i]==min(havsub[,i],na.rm=TRUE)), i] <- NA;}};

          * if missing count=1 in a row ten set next smallest to missing;
          for(i in 1:nrow(havsub)){
            if ( rowmis[i]==1)  {
              havsub[i,havsub[i,]==min(havsub[i,],na.rm=TRUE)] <- NA;}};


      SAS   ( too long to show code - similar to R)

       ROWS
         set all cells<5 to missing
         check in 1 missing set next lowest to missing
         Double tranpose to pivit rows to columns

       COLS (using pivoted data - same code as rows)
         set all cells<5 to missing
         check in 1 missing set next lowest to missing
         Double transpose to pivot rows to columns (original form)


     * For those of you who want to supplament your SAS with R this
       is a good learning example


This type of problem is best solved using R, Python or IML?

The idea is to make it difficult to uncover the missing cells.

Because we have column and row totals,
if only one cell in a column or row is missing, the
missing cell can be found using TOT - SUM(all non missing cells in a row or column).
If there is just one missing cell in any column or row we
need to set another to missing. I choose the next smallest to set to missing.


HAVE
====

Up to 40 obs WORK.HAVE total obs=6

 ROW    X1    X2     X3    X4    TOT

  1      1    18     12    19     50
  2     22    28     25     4     79
  3     12    15     16    10     53
  4     25     2     22     7     56
  5     11    30     26     4     71

  0     71    93    101    44    324  ==> this is total


WANT
====

Obs    ROW    X1    X2     X3    X4       TOT

 1      1      .    18      .    19        50
 2      2      .    28     25     .        79
 3      3     12     .     16    10        53
 4      4     25     .     22     .        50
 5      5      .    30      .     .        71

 6      0     71    93    101    44       324

DETAILS
=======

Blindly set all values<5 in the x1-x4 submatrix to missing

Lets look at a X1 and some rows where X1 is missing


    X1      X2       X3      X4       TOT

     .  1   18 18     . 12   19 19     50  (need at least two in a row 1<5 and 12 is the next smallest)
     . 22   28 28    25 25    . 4      79  (need at least two in a row 4<5 and 22 is the next smallest)

    12 12
    25 25

     . 11   30 30     . 26    . 4      71  (need at least two in a row 4<5 and 11 is the next smallest)

    71

*                _              _       _
 _ __ ___   __ _| | _____    __| | __ _| |_ __ _
| '_ ` _ \ / _` | |/ / _ \  / _` |/ _` | __/ _` |
| | | | | | (_| |   <  __/ | (_| | (_| | || (_| |
|_| |_| |_|\__,_|_|\_\___|  \__,_|\__,_|\__\__,_|

;

options validvarname=upcase;
libname sd1 "d:/sd1";
data sd1.have;
input ROW X1 X2 X3 X4 TOT;
cards4;
 1      1    18    12    19     50
 2     22    28    25     4     79
 3     12    15    16    10     53
 4     25     2    22     1     50
 5     11    30    26     4     71
 0     71    93   101    44    324
;;;;
run;quit;

*          _       _   _
 ___  ___ | |_   _| |_(_) ___  _ __
/ __|/ _ \| | | | | __| |/ _ \| '_ \
\__ \ (_) | | |_| | |_| | (_) | | | |
|___/\___/|_|\__,_|\__|_|\___/|_| |_|

;

%utl_submit_wps64('
libname sd1 "d:/sd1";
options set=R_HOME "C:/Program Files/R/R-3.3.2";
libname wrk "%sysfunc(pathname(work))";
proc r;
submit;
source("c:/Program Files/R/R-3.3.2/etc/Rprofile.site",echo=T);
library(haven);
have<-as.matrix(read_sas("d:/sd1/have.sas7bdat"));
havsub<-have[-nrow(have),-ncol(have)][,-1];
havmis<-havsub[havsub < 5] <- NA;
rowmis<-rowSums(is.na(havsub));
colmis<-colSums(is.na(havsub));
for(i in 1:ncol(havsub)){
  if ( colmis[i]==1)  {
    havsub[(havsub[,i]==min(havsub[,i],na.rm=TRUE)), i] <- NA;
  }
};
havsub;
for(i in 1:nrow(havsub)){
  if ( rowmis[i]==1)  {
    havsub[i,havsub[i,]==min(havsub[i,],na.rm=TRUE)] <- NA;
  }
};
havsub<-cbind(have[,1],havsub,have[,ncol(have)]);
havsub<-rbind(havsub,have[nrow(have),]);
colnames(havsub)<-colnames(have);
havsub<-as.data.frame(havsub);
endsubmit;
import r=havsub data=wrk.wantwps;
run;quit;
');

NOTE: Processing of R statements complete

23        import r=havsub data=wrk.wantwps;
NOTE: Creating data set 'WRK.wantwps' from R data frame 'havsub'
NOTE: Data set "WRK.wantwps" has 6 observation(s) and 6 variable(s)

24        run;
NOTE: Procedure r step took :
      real time : 0.668
      cpu time  : 0.015



libname sd1 "d:/sd1";


options missing=' ';
 data hav1st(keep=row x1-x4 where=(row ne 0));
    retain cnt 0;
    set sd1.have;;
    array cels[4] x1-x4;
    do cel=1 to dim(cels);
       if cels[cel]<=5 and cels[cel] > 0 then do;
              cels[cel]=.;
       end;
    end;
     do cel=1 to dim(cels);
       if cels[cel]=. then cnt=cnt+1;
    end;
    if cnt=1 then do;
       do cel=1 to dim(cels);
          if cels[cel]=. then do;
             min_value=min(of cels(*));
             pos=whichn(min_value, of cels(*));
             cels[pos]=.;
             leave;
          end;
       end;
    end;
    cnt=0;
run;quit;

* pivot so rows are across;
proc transpose data=hav1st out=hav1stxpo(rename=_name_=var);
by row;
run;quit;

proc sort data=hav1stxpo out=hav1stsrt;
by var;
run;quit;

proc transpose data=hav1stsrt out=hav1stxxp(drop=_name_) prefix=x;
by var;
id row;
run;quit;

options missing=' ';
 data havpiv(keep=var x1-x5);
    retain cnt 0;
    set hav1stxxp;;
    array cels[5] x1-x5;
    do cel=1 to dim(cels);
       if cels[cel]<=5 and cels[cel] > 0 then do;
              cels[cel]=.;
       end;
    end;
     do cel=1 to dim(cels);
       if cels[cel]=. then cnt=cnt+1;
    end;
    if cnt=1 then do;
       do cel=1 to dim(cels);
          if cels[cel]=. then do;
             min_value=min(of cels(*));
             pos=whichn(min_value, of cels(*));
             cels[pos]=.;
             leave;
          end;
       end;
    end;
    cnt=0;
run;quit;


* pivot again;
proc transpose data=havpiv out=havpivxpo;
by var;
run;quit;

proc sort data=havpivxpo out=havpivsrt;
by _name_;
run;quit;

proc transpose data=havpivsrt out=havpivxxp(drop=_name_) prefix=x;
by _name_;
run;quit;
[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic