'SAS-L: Search all pdfs for an arbitrary word in multiple pdf files'

[prev in list] [next in list] [prev in thread] [next in thread] 

List:       sas-l
Subject:    SAS-L: Search all pdfs for an arbitrary word in multiple pdf files
From:       Roger Deangelis <roger_deangelis () COMCAST ! NET>
Date:       2024-02-25 18:56:58
Message-ID: 2579376146762481.WA.rogerdeangeliscomcast.net () listserv ! uga ! edu
[Download RAW message or body]

%let pgm=utl-search-all-pdfs-for-an-arbitrary-word-in-mutuiple-pdf-files;

Search all pdfs for an arbitrary word in multiple pdf files

github
http://tinyurl.com/yc6a66zw
https://github.com/rogerjdeangelis/utl-search-all-pdfs-for-an-arbitrary-word-in-mutuiple-pdf-file


related repos on end

see
https://goo.gl/8eFoWu
https://stackoverflow.com/questions/44370333/classifying-pdf-text-documents-based-on-the-presence-absence-of-specific-words-i


Davids profile
https://stackoverflow.com/users/3048453/david

/*               _     _
 _ __  _ __ ___ | |__ | | ___ _ __ ___
> `_ \| `__/ _ \| `_ \| |/ _ \ `_ ` _ \
> > _) | | | (_) | |_) | |  __/ | | | | |
> .__/|_|  \___/|_.__/|_|\___|_| |_| |_|
> _|
*/

/**************************************************************************************************************************/
 /*                                       |                                           \
|                           */ /*               INPUT                   |            \
PROCESS                                 |       OUTPUT              */ /*             \
|                                                    |                           */ \
/* d:/pdffnd/roger1.pdf                  |    Search all pdfs in a directory for \
'alfred'     | file          relevance   */ /*                                       \
|    case insensitive                                |                           */ \
/*    NAME       SEX AGE  HEIGHTWEIGHT   |                                            \
| roger1.pdf    Present     */ /*                                       | %let \
searchword = alfred;                          | roger2.pdf    Not Present */ /*    \
Alfred      M   14   69.0  112.5 * |                                                  \
| roger3.pdf    Not Present */ /*    Carol       F   14   62.8  102.5   |             \
|                           */ /*    Jeffrey     M   13   62.5   84.0   | search_pdf \
<- function(pdf_files                   | Note relevance is a long  */ /*    John     \
M   12   59.0   99.5   |     ,search_term                                   | \
variable name in a V5     */ /*    Mary        F   15   66.5  112.0   |     ,n_words \
= 100) {                              | transport file            */ /*    Robert     \
M   12   64.8  128.0   |   res_list <- lapply(pdf_files, function(file) {   |         \
*/ /*    Ronald      M   15   67.0  133.0   |     content <- pdf_text(file);          \
|                           */ /*                                       |     \
content2 <- tolower(content);                  |                           */ /* \
d:/pdffnd/roger2.pdf                  |     content2 <- gsub("\\n", "", content2);    \
|                           */ /*                                       |     \
content2 <- gsub("[[:punct:]]", "", content2); |                           */ /*    \
NAME       SEX AGE  HEIGHTWEIGHT   |     content_vec <- strsplit(content2, " ")[[1]]; \
|                           */ /*                                       |     found \
<- search_term %in% content_vec[1:n_words|;                          */ /*    Jane    \
F   12   59.8   84.5   |     res <- data_frame(file = file,                 |         \
*/ /*    Janet       F   15   62.5  112.5   |        relevance =                      \
|                           */ /*    Jeffrey     M   13   62.5   84.0   |          \
ifelse(found, "Present", "Not Present")); |                           */ /*    John   \
M   12   59.0   99.5   |     return(res);                                   |         \
*/ /*    Judy        F   14   64.3   90.0   |   });                                   \
|                           */ /*    Mary        F   15   66.5  112.0   |   res_df <- \
bind_rows(res_list);                   |                           */ /*              \
|   return(res_df);                                  |                           */ \
/* d:/pdffnd/roger3.pdf                  | };                                         \
|                           */ /*                                       | \
want<-as.data.frame(search_pdf(pdf_files           |                           */ /*  \
NAME       SEX AGE  HEIGHT EIGHT    | , search_term = "&searchword", n_words = \
1000));   |                           */ /*                                       |   \
|                           */ /*   Barbara     F   13   65.3   98.0    |             \
|                           */ /*   Jeffrey     M   13   62.5   84.0    |             \
|                           */ /*   John        M   12   59.0   99.5    |             \
|                           */ /*   Mary        F   15   66.5  112.0    |             \
|                           */ /*   Thomas      M   11   57.5   85.0    |             \
|                           */ /*                                       |             \
|                           */ \
/**************************************************************************************************************************/


/*                   _
(_)_ __  _ __  _   _| |_
> > `_ \| `_ \| | | | __|
> > > > > > _) | |_| | |_
> _|_| |_| .__/ \__,_|\__|
        |_|
*/

/*----                                                                   ----*/
/*---- CREATE THREE PDFS                                                 ----*/
/*----                                                                   ----*/

data _null_;
  do fyl="d:/pdffnd/roger1.pdf","d:/pdffnd/roger2.pdf","d:/pdffnd/roger3.pdf";
     call symputx('fyl',fyl);
     cnt+1;
     call symputx('seed',put(cnt,2.));
     rc=dosubl('
        ods pdf file="&fyl";
          proc print data=sashelp.class(where=(uniform(&seed)<.3));
          run;quit;
        ods pdf close;
        run;quit;

     ');
  end;
stop;
run;quit;

/**************************************************************************************************************************/
 /*                                                                                   \
*/ /*               INPUT                                                             \
*/ /*                                                                                 \
*/ /* d:/pdffnd/roger1.pdf                                                            \
*/ /*                                                                                 \
*/ /*    NAME       SEX AGE  HEIGHTWEIGHT                                             \
*/ /*                                                                                 \
*/ /*    Alfred      M   14   69.0  112.5                                             \
*/ /*    Carol       F   14   62.8  102.5                                             \
*/ /*    Jeffrey     M   13   62.5   84.0                                             \
*/ /*    John        M   12   59.0   99.5                                             \
*/ /*    Mary        F   15   66.5  112.0                                             \
*/ /*    Robert      M   12   64.8  128.0                                             \
*/ /*    Ronald      M   15   67.0  133.0                                             \
*/ /*                                                                                 \
*/ /* d:/pdffnd/roger2.pdf                                                            \
*/ /*                                                                                 \
*/ /*    NAME       SEX AGE  HEIGHTWEIGHT                                             \
*/ /*                                                                                 \
*/ /*    Jane        F   12   59.8   84.5                                             \
*/ /*    Janet       F   15   62.5  112.5                                             \
*/ /*    Jeffrey     M   13   62.5   84.0                                             \
*/ /*    John        M   12   59.0   99.5                                             \
*/ /*    Judy        F   14   64.3   90.0                                             \
*/ /*    Mary        F   15   66.5  112.0                                             \
*/ /*                                                                                 \
*/ /* d:/pdffnd/roger3.pdf                                                            \
*/ /*                                                                                 \
*/ /*   NAME       SEX AGE  HEIGHT EIGHT                                              \
*/ /*                                                                                 \
*/ /*   Barbara     F   13   65.3   98.0                                              \
*/ /*   Jeffrey     M   13   62.5   84.0                                              \
*/ /*   John        M   12   59.0   99.5                                              \
*/ /*   Mary        F   15   66.5  112.0                                              \
*/ /*   Thomas      M   11   57.5   85.0                                              \
*/ /*                                                                                 \
*/ /**************************************************************************************************************************/


/*
 _ __  _ __ ___   ___ ___  ___ ___
> `_ \| `__/ _ \ / __/ _ \/ __/ __|
> > _) | | | (_) | (_|  __/\__ \__ \
> .__/|_|  \___/ \___\___||___/___/
> _|
*/


%symdel word / nowarn;
%utlfkil(d:/xpt/want.xpt);

%let word=alfred;

%utl_submit_r64x(resolve('
library(pdftools);
library(dplyr);
library(Hmisc);
library(SASxport);

pdf_files <- list.files("d:/pdffnd/", full.names = T);
pdf_files;

search_pdf <- function(pdf_files
    ,search_term
    ,n_words = 100) {
  res_list <- lapply(pdf_files, function(file) {
    content <- pdf_text(file);
    content2 <- tolower(content);
    content2 <- gsub("\\n", "", content2);
    content2 <- gsub("[[:punct:]]", "", content2);
    content_vec <- strsplit(content2, " ")[[1]];
    found <- search_term %in% content_vec[1:n_words];
    res <- data_frame(file = file,
       relevance =
         ifelse(found, "Present", "Not Present"));
    return(res);
  });
  res_df <- bind_rows(res_list);
  return(res_df);
};
want<-as.data.frame(search_pdf(pdf_files
, search_term = "&word", n_words = 1000));
want;
for (i in 1:ncol(want)) {
      label(want[,i])<-colnames(want)[i];
      print(label(want[,i])); };
write.xport(want,file="d:/xpt/want.xpt");
'));

proc datasets lib=work nolist mt=data mt=view nodetails;delete want \
want_r_long_names; run;quit;

/*--- handles long variable names by using the label to rename the variables  ----*/

libname xpt xport "d:/xpt/want.xpt";
proc contents data=xpt._all_;
run;quit;

data want_r_long_names;
  %utl_rens(xpt.want) ;
  set want;
run;quit;
libname xpt clear;

proc print ;
run;quit;

/**************************************************************************************************************************/
 /*                                                                                   \
*/ /*        OUTPUT                                                                   \
*/ /*                                                                                 \
*/ /*  file          relevance (long variable name)                                   \
*/ /*                                                                                 \
*/ /*  roger1.pdf    Present                                                          \
*/ /*  roger2.pdf    Not Present                                                      \
*/ /*  roger3.pdf    Not Present                                                      \
*/ /*                                                                                 \
*/ /*  Note relevance is a long                                                       \
*/ /*  variable name in a V5                                                          \
*/ /*  transport file                                                                 \
*/ /*                                                                                 \
*/ /**************************************************************************************************************************/



REPO
----------------------------------------------------------------------------------------------------------------------------------
 https://github.com/rogerjdeangelis/utl-convert-pdf-to-text-using-python-and-r
https://github.com/rogerjdeangelis/utl-create-a-simple-n-percent-clinical-table-in-r-sas-wps-python-output-pdf-rtf-xlsx-html-list
 https://github.com/rogerjdeangelis/utl-creating-identical-pdf-and-powerpoint-slides
https://github.com/rogerjdeangelis/utl-identical-side-by-side-text-and-graphics-in-pdf-and-powerpoint
 https://github.com/rogerjdeangelis/utl-overlaying-histograms-and-scatterplots-in-powerpoint-pdf-and-jpeg
 https://github.com/rogerjdeangelis/utl-putting-a-frame-around-text-in-doc-rtf-and-pdf-ods-destinations-with-and-without-layout
 https://github.com/rogerjdeangelis/utl-removing-unwanted-bookmarks-in-pdf-table-of-contents-toc
 https://github.com/rogerjdeangelis/utl-scraping-pdf-output-for-pdf-tables-and-lists
https://github.com/rogerjdeangelis/utl-side-by-side-proc-report-output-in-pdf-html-and-excel
 https://github.com/rogerjdeangelis/utl_combine_pdf_files_and_delete_pages_from_a_pdf_pyPDF_ghostscript
 https://github.com/rogerjdeangelis/utl_combining_all_pdf_files_in_a_directory
https://github.com/rogerjdeangelis/utl_convert_pdf_tables_to_SAS_WPS_datasets
https://github.com/rogerjdeangelis/utl_convert_pdf_tables_to_sas_tables
https://github.com/rogerjdeangelis/utl_dropping-down-to-R-and-converting-pdfs-to-sas-tables-and-text
 https://github.com/rogerjdeangelis/utl_dropping-down-to-powershell-and-converting-doc-and-rtf-files-to-pdfs
 https://github.com/rogerjdeangelis/utl_ods_pdf_and_rtf_two_different_page_titles_on_the_same_page
 https://github.com/rogerjdeangelis/utl_pdf_graphics_top_40_a_sas_ods_graphics_look_at_chicago_public_schools_salaries_by_job


/*              _
  ___ _ __   __| |
 / _ \ `_ \ / _` |
> __/ | | | (_| |
 \___|_| |_|\__,_|

*/



















































d:/pdffnd/roger1.pdf                                search all pdfs in a directory \
for 'alfred'                     file            relevance  case insensitive
   NAME       SEX    AGE    HEIGHT    WEIGHT                                          \
                d:/pdffnd/roger1.pdf    Present
                                                    %let searchword = alfred;         \
d:/pdffnd/roger2.pdf    Not Present  Alfred      M      14     69.0      112.5 *      \
d:/pdffnd/roger3.pdf    Not Present  Carol       F      14     62.8      102.5
   Jeffrey     M      13     62.5       84.0        search_pdf <- function(pdf_files
   John        M      12     59.0       99.5            ,search_term
   Mary        F      15     66.5      112.0            ,n_words = 100) {
   Robert      M      12     64.8      128.0          res_list <- lapply(pdf_files, \
function(file) {  Ronald      M      15     67.0      133.0            content <- \
                pdf_text(file);
                                                        content2 <- tolower(content);
d:/pdffnd/roger2.pdf                                    content2 <- gsub("\\n", "", \
                content2);
                                                        content2 <- \
gsub("[[:punct:]]", "", content2);  NAME       SEX    AGE    HEIGHT    WEIGHT         \
                content_vec <- strsplit(content2, " ")[[1]];
                                                        found <- search_term %in% \
content_vec[1:n_words];  Jane        F      12     59.8       84.5            res <- \
data_frame(file = file,  Janet       F      15     62.5      112.5               \
relevance =  Jeffrey     M      13     62.5       84.0                 ifelse(found, \
"Present", "Not Present"));  John        M      12     59.0       99.5            \
return(res);  Judy        F      14     64.3       90.0          });
   Mary        F      15     66.5      112.0          res_df <- bind_rows(res_list);
                                                      return(res_df);
d:/pdffnd/roger3.pdf                                };
                                                    \
want<-as.data.frame(search_pdf(pdf_files  NAME       SEX    AGE    HEIGHT    WEIGHT   \
, search_term = "&searchword", n_words = 1000));

  Barbara     F      13     65.3       98.0
  Jeffrey     M      13     62.5       84.0
  John        M      12     59.0       99.5
  Mary        F      15     66.5      112.0
  Thomas      M      11     57.5       85.0


         file            relevance

 d:/pdffnd/roger1.pdf    Present
 d:/pdffnd/roger2.pdf    Not Present
 d:/pdffnd/roger3.pdf    Not Present

















 The WPS System

 Up to 40 obs from wantwps total obs=3

 Obs            FILE            RELEVANCE

  1     d:/pdffnd/roger1.pdf    Relevant     (Alfred is only in this pdf)
  2     d:/pdffnd/roger2.pdf    Irrelevant
  3     d:/pdffnd/roger3.pdf    Irrelevant


* create 3 pdfs;
data _null_;
  do fyl="d:/pdffnd/roger1.pdf","d:/pdffnd/roger2.pdf","d:/pdffnd/roger3.pdf";
     call symputx('fyl',fyl);
     cnt+1;
     call symputx('seed',put(cnt,2.));
     rc=dosubl('
        ods pdf file="&fyl";
          proc print data=sashelp.class(where=(uniform(&seed)<.3));
          run;quit;
        ods pdf close;
        run;quit;

     ');
  end;
stop;
run;quit;


*          _       _   _
 ___  ___ | |_   _| |_(_) ___  _ __
/ __|/ _ \| | | | | __| |/ _ \| '_ \
\__ \ (_) | | |_| | |_| | (_) | | | |
> ___/\___/|_|\__,_|\__|_|\___/|_| |_|

;

see link for commented code

%utl_submit_wps64('
libname wrk "%sysfunc(pathname(work))";
proc r;
submit;
library(pdftools);
library(dplyr);

pdf_files <- list.files("d:/pdffnd/", full.names = T);
pdf_files;

search_pdf <- function(pdf_files, search_term, n_words = 100) {
  res_list <- lapply(pdf_files, function(file) {
    content <- pdf_text(file);
    content2 <- tolower(content);
    content2 <- gsub("\\n", "", content2);
    content2 <- gsub("[[:punct:]]", "", content2);
    content_vec <- strsplit(content2, " ")[[1]];
    found <- search_term %in% content_vec[1:n_words];
    res <- data_frame(file = file,
       relevance = ifelse(found, "Present", "Not Present"));
    return(res);
  });
  res_df <- bind_rows(res_list);
  return(res_df);
};
want<-as.data.frame(search_pdf(pdf_files, search_term = "alfred", n_words = 1000));
want;
endsubmit;
import r=want data=wrk.wantwps;
run;quit;
proc print data=wrk.wantwps;
run;quit;
');


%symdel word / nowarn;
%utlfkil(d:/xpt/want.xpt);

%let word=alfred;

%utl_submit_r64x(resolve('
library(pdftools);
library(dplyr);
library(Hmisc);
library(SASxport);

pdf_files <- list.files("d:/pdffnd/", full.names = T);
pdf_files;

search_pdf <- function(pdf_files
    ,search_term
    ,n_words = 100) {
  res_list <- lapply(pdf_files, function(file) {
    content <- pdf_text(file);
    content2 <- tolower(content);
    content2 <- gsub("\\n", "", content2);
    content2 <- gsub("[[:punct:]]", "", content2);
    content_vec <- strsplit(content2, " ")[[1]];
    found <- search_term %in% content_vec[1:n_words];
    res <- data_frame(file = file,
       relevance =
         ifelse(found, "Present", "Not Present"));
    return(res);
  });
  res_df <- bind_rows(res_list);
  return(res_df);
};
want<-as.data.frame(search_pdf(pdf_files
, search_term = "&word", n_words = 1000));
want;
for (i in 1:ncol(want)) {
      label(want[,i])<-colnames(want)[i];
      print(label(want[,i])); };
write.xport(want,file="d:/xpt/want.xpt");
'));

proc datasets lib=work nolist mt=data mt=view nodetails;delete want \
want_r_long_names; run;quit;

/*--- handles long variable names by using the label to rename the variables  ----*/

libname xpt xport "d:/xpt/want.xpt";
proc contents data=xpt._all_;
run;quit;

data want_r_long_names;
  %utl_rens(xpt.want) ;
  set want;
run;quit;
libname xpt clear;































         ;;;;%end;%mend;/*'*/ \
*);*};*];*/;/*"*/;run;quit;%end;end;run;endcomp;%utlfix;






















* T1005340 StackOverflow R: Classifying PDF Text Documents based on the \
presence/absence of specific words in R

Is Alfred is in any of the three pdfs? Case insensitive.

Fancy word is Text Mining. I prefer 'Find Alfred'.

WORKING CODE WPS/R  (stops after 1000 words)

     search_pdf(pdf_files, search_term = "alfred", n_words = 1000))

see
https://goo.gl/8eFoWu
https://stackoverflow.com/questions/44370333/classifying-pdf-text-documents-based-on-the-presence-absence-of-specific-words-i


Davids profile
https://stackoverflow.com/users/3048453/david


HAVE ( three PDF files )
========================

d:/pdffnd/roger1.pdf

   NAME       SEX    AGE    HEIGHT    WEIGHT

   Alfred      M      14     69.0      112.5   * only in roger1.pdf
   Carol       F      14     62.8      102.5
   Jeffrey     M      13     62.5       84.0
   John        M      12     59.0       99.5
   Mary        F      15     66.5      112.0
   Robert      M      12     64.8      128.0
   Ronald      M      15     67.0      133.0

d:/pdffnd/roger2.pdf

   NAME       SEX    AGE    HEIGHT    WEIGHT

   Jane        F      12     59.8       84.5
   Janet       F      15     62.5      112.5
   Jeffrey     M      13     62.5       84.0
   John        M      12     59.0       99.5
   Judy        F      14     64.3       90.0
   Mary        F      15     66.5      112.0

d:/pdffnd/roger3.pdf

  NAME       SEX    AGE    HEIGHT    WEIGHT

  Barbara     F      13     65.3       98.0
  Jeffrey     M      13     62.5       84.0
  John        M      12     59.0       99.5
  Mary        F      15     66.5      112.0
  Thomas      M      11     57.5       85.0


WANT
====

The WPS System

Up to 40 obs from wantwps total obs=3

Obs            FILE            RELEVANCE

 1     d:/pdffnd/roger1.pdf    Relevant     (Alfred is only in this pdf)
 2     d:/pdffnd/roger2.pdf    Irrelevant
 3     d:/pdffnd/roger3.pdf    Irrelevant

*                _                    _  __
 _ __ ___   __ _| | _____   _ __   __| |/ _|___
> '_ ` _ \ / _` | |/ / _ \ | '_ \ / _` | |_/ __|
> > > > > > (_| |   <  __/ | |_) | (_| |  _\__ \
> _| |_| |_|\__,_|_|\_\___| | .__/ \__,_|_| |___/
                           |_|
;

* create 3 pdfs;
data _null_;
  do fyl="d:/pdf/roger1.pdf","d:/pdf/roger2.pdf","d:/pdf/roger3.pdf";
     call symputx('fyl',fyl);
     cnt+1;
     call symputx('seed',put(cnt,2.));
     rc=dosubl('
        ods pdf file="&fyl";
          proc print data=sashelp.class(where=(uniform(&seed)<.3));
          run;quit;
        ods pdf close;
        run;quit;

     ');
  end;
stop;
run;quit;

*          _       _   _
 ___  ___ | |_   _| |_(_) ___  _ __
/ __|/ _ \| | | | | __| |/ _ \| '_ \
\__ \ (_) | | |_| | |_| | (_) | | | |
> ___/\___/|_|\__,_|\__|_|\___/|_| |_|

;

see link for commented code

%utl_submit_wps64('
libname wrk "%sysfunc(pathname(work))";
proc r;
submit;
library(pdftools);
library(dplyr);

pdf_files <- list.files("d:/pdffnd/", full.names = T);
pdf_files;

search_pdf <- function(pdf_files, search_term, n_words = 100) {
  res_list <- lapply(pdf_files, function(file) {
    content <- pdf_text(file);
    content2 <- tolower(content);
    content2 <- gsub("\\n", "", content2);
    content2 <- gsub("[[:punct:]]", "", content2);
    content_vec <- strsplit(content2, " ")[[1]];
    found <- search_term %in% content_vec[1:n_words];
    res <- data_frame(file = file,
       relevance = ifelse(found, "Relevant", "Irrelevant"));
    return(res);
  });
  res_df <- bind_rows(res_list);
  return(res_df);
};
want<-as.data.frame(search_pdf(pdf_files, search_term = "alfred", n_words = 1000));
want;
endsubmit;
import r=want data=wrk.wantwps;
run;quit;
proc print data=wrk.wantwps;
run;quit;
');


[prev in list] [next in list] [prev in thread] [next in thread]
Configure | About | News | Add a list | Sponsored by KoreLogic