[prev in list] [next in list] [prev in thread] [next in thread] 

List:       sas-l
Subject:    SAS-L: Thirteen algorithms to split a table based on groups of data;
From:       Roger DeAngelis <rogerjdeangelis () GMAIL ! COM>
Date:       2018-06-27 20:18:51
Message-ID: 4268863177530637.WA.rogerjdeangelisgmail.com () listserv ! uga ! edu
[Download RAW message or body]

SAS-L: Thirteen algorithms to split a table based on groups of data;

github
https://tinyurl.com/y6uzqbm2
https://github.com/rogerjdeangelis/utl_thirteen_algorithms_to_split_a_table_based_on_groups_of_data


  Tweleve Splitting Solutions

     1. Hash without sort Paul Dorfman
     2. Hash without sort Low memory Paul Dorfman
        (amazing only a Hash and open=defer can create dynamic datasets)
        (Hash should be able to create a defered new dataset within a datastep
         i.e. set x y open=defer where y is created by the Hash)
     3. Dosubl SQL then Datastep
     4. Dosubl Sort and index then Datastep
     5. Dosubl Just datastep  (very inefficient)
     6. R split function
     7. Dosubl datastep inside proc sql (kind of useless)
     8. SQL array
     9. Datstep and macro (Chriss Hemedinger)
    10. Dosubl in one SQL selct statement (Variation of 9)
    11. Call execute large dataset solution?
    12  Split datasst on columns
    13  Split dataset on rows


see
https://goo.gl/4QL1o7
https://communities.sas.com/t5/SAS-Statistical-Procedures/How-to-quot-Split-Data-quot-By-Group-Processing/m-p/431439


see
Novinosrin profile
https://communities.sas.com/t5/user/viewprofilepage/user-id/138205


INPUT
=====

 WORK.HAVE total obs=27                 |  RULES  (NW AND ow DATASETS)
                                        |
   WEIGHT    ID    TREATMENT    KCAL    |  WORK.NW total obs=15
                                        |
     NW      1         A         400    |    WEIGHT    ID    TREATMENT    KCAL
     NW      2         A         500    |
     OW      3         A         560    |      NW      1         A         400
     NW      4         A         800    |      NW      2         A         500
     OW      5         A         490    |      NW      4         A         800
     NW      6         A         500    |      NW      6         A         500
     OW      7         A         400    |    ...
                                        |  WORK.OW total obs=12
                                        |
                                        |    WEIGHT    ID    TREATMENT    KCAL
                                        |
                                        |      OW      3         A         560
                                        |      OW      5         A         490
                                        |      OW      7         A         400


PROCESS ( All the code)
=======================

 1. HASH without sort Paul Dorfman (this uses sashelp.citimon dataset)

    data _null_ ;
      dcl hash hh (ordered:"a") ;
      hh.definekey  ("m") ;
      hh.definedata ("mon", "h") ;
      hh.definedone () ;
      do until (z) ;
        set sashelp.citimon end = z ;
        m = month (date) ;
        mon = put (date, worddate3.) ;
        if hh.find() ne 0 then do ;
          dcl hash h (dataset:"sashelp.citimon(obs=0)", multidata:"y") ;
          h.definekey ("date") ;
          h.definedata (all:"y") ;
          h.definedone () ;
          hh.add() ;
        end ;
        h.add() ;
      end ;
      dcl hiter ihh ("hh") ;
      do while (ihh.next() = 0) ;
        h.output (dataset:mon) ;
      end ;
      stop ;
    run ;

    NOTE: The data set WORK.JAN has 13 observations and 19 variables.
    NOTE: The data set WORK.FEB has 12 observations and 19 variables.
    NOTE: The data set WORK.MAR has 12 observations and 19 variables.
    ...
    NOTE: The data set WORK.DEC has 12 observations and 19 variables.


 2. HASH without sort Low memory Paul Dorfman
 ============================================

    Interesting dynamic input and output datasets;

    1. It illustrates a curious angle of the dynamic nature of the hash object.
    2. I haven't seen it done this way before.

    data _null_ ;
      do i = 0 to 11 ;
        d = intnx("mon",0,i) ;
        dcl hash h (dataset:cats("sashelp.citimon(where=(month(date)=",month(d),"))"),multidata:"y") \
;  h.definekey ("date") ;
        h.definedata (all:"y") ;
        h.definedone() ;
        h.output (dataset:put(d,worddate3.)) ;
        h.clear() ;
      end ;
      stop ;
      set sashelp.citimon ;
    run ;


 3. DOSUBL SQL then Datastep
 ===========================

    data _null_;

      * faster than distinct;
      if _n_=0 then do;
        %let rc=%sysfunc(dosubl('
          proc sql;
             select quote(max(weight)) into :wgts separated by "," from have  group \
by weight  ;quit;
        '));
      end;

      do wgt=&wgts;
        call symputx("wgt",wgt);
        rc=dosubl('
          data &wgt;
            set have(where=(weight=symget("wgt")));
          run;quit;
        ');
      end;

    run;quit;


 4. SORT DOSUBL (need an index on weight)
 =======================================

    data _null_;

      if _n_=0 then do;
        %let rc=%sysfunc(dosubl('
          proc sort data=have out=havSrt(index=(weight));
             by weight;
          run;quit;
        '));
      end;

      set havSrt(keep=weight);
      by weight;
      if last.weight then do;
         call symputx('wgt',weight);
         rc=dosubl('
            data &wgt;
               set have(where=(weight=symget("wgt")));
            run;quit;
        ');
      end;
    run;quit;


 5. DOSUBL JUST DATASTEP
 =======================

    * just in case they exist;
    proc datasets lib=work;
      delete ow nw rec;
    run;quit;

    data _null_;

      set have(keep=weight);
      call symputx('wgt',weight);
      call symputx('rec',_n_);

      * need to optimizing compiler;
      rc=dosubl('
        data rec; set have; if _n_=&rec then do; output;stop;end; run;quit;
        proc append base=&wgt data=rec;
        run;quit;
      ');

    run;quit;


 6. R SPLIT FUNCTION
 ===================

    %utl_submit_wps64('
    libname sd1 sas7bdat "d:/sd1";
    options set=R_HOME "C:/Program Files/R/R-3.3.2";
    libname wrk sas7bdat "%sysfunc(pathname(work))";
    proc r;
    submit;
    source("C:/Program Files/R/R-3.3.2/etc/Rprofile.site", echo=T);
    library(haven);
    have<-read_sas("d:/sd1/have.sas7bdat");
    want<-split(have, list(have$WEIGHT), drop = TRUE);
    list2env(want,envir=.GlobalEnv);
    endsubmit;
    import r=OW  data=wrk.wantwpsow;
    import r=NW  data=wrk.wantwpsnw;
    run;quit;
    ');

    proc print data=wantwpsnw(obs=4);
    run;quit;

    Up to 40 obs from wantwpsnw total obs=15

    Obs    WEIGHT    ID    TREATMENT    KCAL

      1      NW      1         A         400
      2      NW      2         A         500
      3      NW      4         A         800
      4      NW      6         A         500


  7. DOSUBL datastep inside proc sql
  ==================================


     * I keep an emty dataset around because sql requires on even when you don't need \
one;

     data sasuser.empty;
       rc=0;
     run;quit;

     %symdel NW OW weights  / nowarn; * just in case you rerun;
     proc sql;
        select
           max(weight) into :weights separated by " "
        from
           have
        group
           by weight
        ;
        select
           dosubl('
            data &weights;
               set have;
               select (weight);
                 %array(wgts,values=&weights)
                 %do_over(wgts,phrase=%str(when ("?") output ?;))
               end;
            run;quit;') as rc
        from
           sasuser.empty
     ;quit;


  8. SQL Array
  ==================================


     %symdel NW OW weights  / nowarn; * just in case you rerun;
     proc sql;
        select
           max(weight) into :weights separated by " "
        from
           have
        group
           by weight
        ;
        %array(wgts,values=&weights)
        %do_over(wgts,phrase=%str(
           proc sql;
            create table ? as select *
            from have where weight="?";quit;))
     ;quit;


  9. Datastep and macro  Chris Hemedinger
  ========================================

     Chris Hemedinger

     %let TABLE=sashelp.cars;
     %let COLUMN=origin;

     proc sql noprint;
     /* build a mini program for each value */
     /* create a table with valid chars from data value */
     select distinct
        cat("DATA out_",compress(&COLUMN.,,'kad'),
        "; set &TABLE.(where=(&COLUMN.='", &COLUMN.,
        "')); run;") into :allsteps separated by ';'
       from &TABLE.;
     quit;

     /* macro that includes the program we just generated */
     %macro runSteps;
      &allsteps.;
     %mend;

     /* and...run the macro when ready */
     %runSteps;


  10. DOSUBL in one SQL selct statement
  =====================================

     proc sql noprint;

       /* build a mini program for each value */
       /* create a table with valid chars from data value */

       select
          distinct
            dosubl(catx(" ",
              "data",sex
                ,";set sashelp.class(where=(sex=",quote(sex)
                ,"));run;quit;"
          ))
       from
          sashelp.class

     ;quit;




        cat("DATA out_",compress(&COLUMN.,,'kad'),
        "; set &TABLE.(where=(&COLUMN.='", &COLUMN.,
        "')); run;") into :allsteps separated by ';'
       from &TABLE.;
     quit;

     /* macro that includes the program we just generated */
     %macro runSteps;
      &allsteps.;
     %mend;

     /* and...run the macro when ready */
     %runSteps;

  11. Call execute large dataset solution?

      * put the levels of sex into &sec = "F" "M";
      proc sql;
        select distinct quote(strip(sex)) into :sex separated by ' ' from \
sashelp.class  ;quit;

      %put &=sex;    /* the levels "F" "M";
      %put &=sqlobs; /* then number of levels */

      data _null_;
         length sas $200;
         array arysex[&sqlobs] $32 (&sex);
         sas='data ';
         call execute(sas);

         do i=1 to &sqlobs;
            sas=arysex[i];
            call execute(sas);
         end;

         /* change dataset and sex for other data */
         sas=';set sashelp.class; select (sex);';
         call execute(sas);

         do i=1 to &sqlobs;
            sas=catx(' ',cats("when ('",arysex[i],"')"),'output',arysex[i],';');
            call execute(sas);
            put sas;
         end;

         sas='otherwise;end;run;quit;';
         call execute(sas);
      run;quit;


/* T3101140 Split a dataset by sets of variables using SQL

Same results in WPS and SAS

github
https://github.com/rogerjdeangelis/utl_split_a_dataset_by_sets_of_variables_using_sql

see for do_over macro
https://goo.gl/EUYyaB
https://github.com/rogerjdeangelis/utl_sql_looping_or_using_arrays_in_sql_do_over_macro/blob/master/utl_sql_looping_or_using_arrays_in_sql_do_over.sas



stackoverflow
https://stackoverflow.com/questions/48969348/sas-proc-sql-select-columns-belonging-together


INPUT
=====

   RULES (USE SQL and create two datasets C1 and C2

    1.  Create dataset C1 with varaibles _C10_--_C19_
    2.  Create dataset C2 with varaibles _C20_--_C29_


 WORK.HAVE  total obs=1

 Obs _C10_ _C11_ _C12_ _C13_ _C14_ _C15_ _C16_ _C17_ _C18_ _C19_

  1  37851 52717  6400 35160 29928  8803 45861 11191 34847 13532

 Obs _C20_ _C21_ _C22_ _C23_  _C24_ _C25_ _C26_ _C27_ _C28_ _C29_

  1  42999 38913  5202 32790 100150 35176 43656 61507  9979 37850


PROCESS
=======

  proc sql;
     create
        table c1 as
     select
        %array(Cs,values=1-9)
        %do_over(cs,phrase=_C1?_,between=comma)
     from
        have
     ;
     create
        table c2 as
     select
        %do_over(cs,phrase=_C2?_,between=comma)
     from
        have
     ;
  quit;


OUTPUT
======

 WORK.C1 total obs=1

  Obs _C11_ _C12_ _C13_ _C14_ _C15_ _C16_ _C17_ _C18_ _C19_

   1  52717  6400 35160 29928  8803 45861 11191 34847 13532


 WORK.C1 total obs=1

  Obs _C21_ _C22_ _C23_  _C24_ _C25_ _C26_ _C27_ _C28_ _C29_

   1  38913  5202 32790 100150 35176 43656 61507  9979 37850

*                _              _       _
 _ __ ___   __ _| | _____    __| | __ _| |_ __ _
> '_ ` _ \ / _` | |/ / _ \  / _` |/ _` | __/ _` |
> > > > > > (_| |   <  __/ | (_| | (_| | || (_| |
> _| |_| |_|\__,_|_|\_\___|  \__,_|\__,_|\__\__,_|

;

proc report data=sashelp.cars nowd missing out=have(keep=_C10_--_C29_);
cols make, weight;
define make / across;
define weight / sum;
run;quit;

*          _       _   _
 ___  ___ | |_   _| |_(_) ___  _ __
/ __|/ _ \| | | | | __| |/ _ \| '_ \
\__ \ (_) | | |_| | |_| | (_) | | | |
> ___/\___/|_|\__,_|\__|_|\___/|_| |_|

;


* SAS;
proc sql;
   create
      table c1 as
   select
      %array(Cs,values=1-9)
      %do_over(cs,phrase=_C1?_,between=comma)
   from
      have
   ;
   create
      table c2 as
   select
      %do_over(cs,phrase=_C2?_,between=comma)
   from
      have
   ;
quit;


* WPS;
%utl_submit_wps64('
libname wrk "%sysfunc(pathname(work))";

 proc sql;
   create
      table c1 as
   select
      %array(Cs,values=1-9)
      %do_over(cs,phrase=_C1?_,between=comma)
   from
      wrk.have
   ;
   create
      table c2 as
   select
      %do_over(cs,phrase=_C2?_,between=comma)
   from
      wrk.have
   ;
 quit;

');

/* T3099810 StackOverflow SAS: Split row into two columns with sas proc transpose

github
https://gist.github.com/rogerjdeangelis/b13b651116304c000749b2fca9bba2ab

https://goo.gl/rj7Diz
https://stackoverflow.com/questions/47849771/split-row-into-two-columns-with-sas-proc-transpose




 WORK.HAVE total obs=3                             |      RULES
                                                   |
   UNIT    A1     B1     A2     B2     A3     B3   | unit  rows  A     B
                                                   |
    1      A11    B11    A21    B21    A31    B31  |   1    1    A11   B11  for \
unit=1 create 3 rows 2 cols  2      A12    B12    A22    B22    A32    B32  |   1    \
2    A21   B21  3      A13    B13    A23    B23    A33    B33  |   1    3    A31   \
B31  |
                                                   |   2    1    A12   B12
                                                   |   2    2    A22   B22
                                                   |   2    3    A32   B32
 PROCESS
========

  proc transpose data=have out=havxpo;
     var a1-a3 b1-b3;
     by unit;
  run;quit;

  data want;
    * get the dimsnsion of the array ;
    if _n_=0 then do;
      %let rc=dosubl('
          proc sql;
            select max(substr(_name_,2)) into :dim trimmed
            from havxpo
          ;quit;
      ');
    end;

    retain unit row a b x1-x%eval(&dim.*2);
    array mat3x2[3,2] $3 x1-x%eval(&dim.*2);
    * load 3x2 array, temporary would be better;
    do row=1 by 1 until(last.unit);
      set havXpo;
      by unit;
      select ;
        when ( row <= 3 ) mat3x2[row,1]=col1;
        otherwise         mat3x2[row-3,2]=col1;
      end;
    end;
    * output array 2 columns at a time;
    do row=1 to 3;
         a=mat3x2[row,1];
         b=mat3x2[row,2];
         keep unit row a b;
         output;
    end;
  run;quit;

OUTPUT
======

  WORK.WANT total obs=9

     UNIT    ROW     A      B

      1       1     A11    B11
      1       2     A21    B21
      1       3     A31    B31

      2       1     A12    B12
      2       2     A22    B22
      2       3     A32    B32

      3       1     A13    B13
      3       2     A23    B23
      3       3     A33    B33

*                _              _       _
 _ __ ___   __ _| | _____    __| | __ _| |_ __ _
> '_ ` _ \ / _` | |/ / _ \  / _` |/ _` | __/ _` |
> > > > > > (_| |   <  __/ | (_| | (_| | || (_| |
> _| |_| |_|\__,_|_|\_\___|  \__,_|\__,_|\__\__,_|

;
data have;
   input (unit  A1  B1  A2  B2  A3  B3) ($);
cards4;
 1 A11 B11 A21 B21 A31 B31
 2 A12 B12 A22 B22 A32 B32
 3 A13 B13 A23 B23 A33 B33
;;;;
run;quit;

proc transpose data=have out=havxpo;
   var a1-a3 b1-b3;
   by unit;
run;quit;

/*
Up to 40 obs WORK.HAVXPO total obs=18

    UNIT    _NAME_    COL1

     1        A1      A11
     1        A2      A21
     1        A3      A31
     1        B1      B11
     1        B2      B21
     1        B3      B31
     2        A1      A12
     2        A2      A22
     2        A3      A32
     2        B1      B12
     2        B2      B22
     2        B3      B32
     3        A1      A13
     3        A2      A23
     3        A3      A33
     3        B1      B13
     3        B2      B23
     3        B3      B33
*/

data havRow;
  if _n_=0 then do;
    %let rc=dosubl('
        proc sql;
          select max(substr(_name_,2)) into :dim trimmed
          from havxpo
        ;quit;
    ');
  end;
  retain unit row a b x1-x%eval(&dim.*2);
  array mat3x2[3,2] $3 x1-x%eval(&dim.*2);
  do row=1 by 1 until(last.unit);
    set havXpo;
    by unit;
    select ;
      when ( row <= 3 ) mat3x2[row,1]=col1;
      otherwise         mat3x2[row-3,2]=col1;
    end;
  end;
  do row=1 to 3;
       a=mat3x2[row,1];
       b=mat3x2[row,2];
       keep unit row a b;
       output;
  end;
run;quit;


unit  rows  A     B
  1    1    A11   B11
  1    2    A21   B21
  1    3    A31   B31
  2    1    A12   B12
  2    2    A22   B22
  2    3    A32   B32


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic