#!/usr/bin/env bash
#
# Merge every certificate PDF under Data/Raw_Data/Complete_PDF_Data/, extract
# the text with pdftotext and rewrite it as "~"-delimited records (one record
# per "Certificate Type") in ./Data/PROCESSED_DATA/temp.tsv, ready for further
# processing in R.
#
# Requires: pdfunite and pdftotext on PATH, and GNU sed (the expressions below
#           rely on "\n" inside an s/// replacement).
# Side effects: writes out.txt (kept) and out.pdf (removed again) in the
#               current directory.
#
# `set -e` is deliberately not used: grep legitimately exits 1 when no record
# carries sub-certificates. Critical commands are checked explicitly instead.
set -u

#######################################
# Collapse every run of spaces, tabs and newlines into one space and trim both
# ends. This is what the shell's default word splitting of an unquoted
# variable would produce, but without pathname expansion ("*", "?") and
# without interpreting backslash sequences found in the data.
# Arguments: none
# Outputs:   stdin, flattened to a single line (no trailing newline)
#######################################
squeeze_whitespace() {
  tr '\t\n' '  ' | tr -s ' ' | sed -e 's/^ //' -e 's/ $//'
}

#######################################
# Turn the "Label:" column indicators into "~"-delimited fields.
# Arguments: none
# Outputs:   stdin with the labels rewritten
#######################################
mark_field_labels() {
  # No "~" was seen in the data, but to be safe replace any that exist so that
  # "~" can be used as the field delimiter.
  # "Certificate Type" gets no leading "~" because it starts a new record.
  # "Sites:" indicates that there are multiple certificates tied to the main
  # certificate; HEAD_CERT marks such a record so that it can be processed
  # separately later on.
  sed \
    -e 's/~/--/g' \
    -e 's/Abbreviation:/~Abbreviation~/g' \
    -e 's/Original Authorized Date:/~Original Authorized Date~/g' \
    -e 's/Authorized Date:/~Authorized Date~/g' \
    -e 's/Certificate Number:/~Certificate Number~/g' \
    -e 's/Certificate Status:/~Certificate Status~/g' \
    -e 's/Certificate Type:/Certificate Type~/g' \
    -e 's/Company Address:/~Company Address~/g' \
    -e 's/Company Name:/~Company Name~/g' \
    -e 's/Division Name:/~Division Name~/g' \
    -e 's/Expiration Date:/~Expiration Date~/g' \
    -e 's/Extension Date:/~Extension Date~/g' \
    -e 's/Scope:/~Scope~/g' \
    -e 's/Sites:/HEAD_CERT/g' \
    -e 's/Scope Statement:/~Scope Statement~/g'
}

#######################################
# First tidy pass: prefix the line with "~" when it holds any field, drop the
# repeated "CA Connect CERTIFICATE HOLDER DETAILS" page heading and remove the
# blanks that surround a "~".
# Arguments: none
# Outputs:   stdin, tidied
#######################################
tidy_first_pass() {
  # The temporary trailing blank lets a heading at the very end match too.
  sed \
    -e '/~/s/^/~/' \
    -e 's/$/ /' \
    -e 's/ CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/ ~/~/g' \
    -e 's/~ /~/g'
}

#######################################
# Second tidy pass: the first pass again plus a few extra clean-ups.
# Arguments: none
# Outputs:   stdin, tidied
#######################################
tidy_second_pass() {
  # NOTE(review): 's/^L//g' strips a leading letter "L" as written; it may
  # originally have been a literal form feed (Ctrl-L) - confirm.
  # The "Sites:" expression is normally a no-op here because every "Sites:"
  # has already been rewritten to HEAD_CERT by mark_field_labels.
  sed \
    -e '/~/s/^/~/' \
    -e 's/$/ /' \
    -e 's/^L//g' \
    -e 's/ CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/Sites:/HAS_SUB_CERT:TRUE~~~~Sites:/g' \
    -e 's/~~~~/\n/g' \
    -e 's/ ~/~/g' \
    -e 's/~ /~/g'
}

#######################################
# Convert raw pdftotext output into "~"-delimited certificate records.
# Arguments: none
# Outputs:   reads the extracted text on stdin, writes the sorted,
#            de-duplicated records to stdout
# Returns:   0
#######################################
certificate_text_to_records() {
  local text records sub_records plain_records merged

  # Work on one long line; each stage re-flattens like the pipeline expects.
  text=$(squeeze_whitespace | mark_field_labels)
  text=$(printf '%s\n' "$text" | tidy_first_pass | squeeze_whitespace)
  text=$(printf '%s\n' "$text" | tidy_second_pass | squeeze_whitespace)

  # Replace the cert type NA with NAC.
  # NOTE(review): the pattern is not anchored after "NA", so any type that
  # merely starts with "NA" gets a "C" inserted as well - confirm.
  #
  # Then add a header with many "~" so that R won't truncate the data when
  # there are fewer columns than expected (R does most of the manipulation
  # into a standard table). "Certificate Type" always begins a new entry, so
  # start a new line in front of it.
  records=$(printf '%s\n' "$text" \
    | sed \
      -e 's/Certificate Type~NA/Certificate Type~NAC/g' \
      -e '1 s/^/~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~/' \
      -e 's/Certificate Type/\nCertificate Type/g')

  # Some nuclear certificates have sub-certificates that apply to certain
  # things. Pull those records out: the part before HEAD_CERT becomes a
  # HAS_SUB_CERT line and every "Certification Number:" after it becomes its
  # own SUB_CERT line.
  sub_records=$(printf '%s\n' "$records" \
    | grep HEAD_CERT \
    | sed \
      -e 's/^/HAS_SUB_CERT~TRUE~/' \
      -e 's/HEAD_CERT/\n/' \
      -e 's/Certification Number:/\nSUB_CERT~TRUE~Certificate Number~/g' \
    | sed '/^[[:space:]]*$/d' \
    | sed -e 's/ ~/~/g' -e 's/~ /~/g')

  # All normal certificates.
  plain_records=$(printf '%s\n' "$records" | grep -v HEAD_CERT)

  # Combine both kinds and keep ONE copy of every duplicated line. The former
  # `uniq -u` threw away every copy of a line that occurred more than once.
  # NOTE(review): sort order depends on the locale, so the position of the
  # padded header line in the output does too - confirm R does not need it
  # on top.
  merged=$(printf '%s\n%s\n' "$plain_records" "$sub_records" | sort | uniq)
  printf '%s\n' "$merged"
}

#######################################
# Run the whole extraction: PDFs -> out.txt -> temp.tsv.
# Arguments: none
# Outputs:   diagnostics on stderr
# Returns:   0 on success, 1 when a tool or an input is missing or a step fails
#######################################
main() {
  local pdf_dir=Data/Raw_Data/Complete_PDF_Data
  local merged_pdf=out.pdf
  local text_file=out.txt
  local tsv_file=./Data/PROCESSED_DATA/temp.tsv
  local tool rc=0
  local -a pdfs

  for tool in pdfunite pdftotext; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      printf 'error: required tool not found: %s\n' "$tool" >&2
      return 1
    fi
  done

  # An unmatched glob stays literal, so test that the first entry exists.
  pdfs=("$pdf_dir"/*.pdf)
  if [[ ! -e "${pdfs[0]}" ]]; then
    printf 'error: no PDF files found in %s\n' "$pdf_dir" >&2
    return 1
  fi

  # Raise the open-file limit before merging; presumably needed because many
  # PDFs are handed to pdfunite at once. Failing to raise it is not fatal.
  ulimit -n 4096 || printf 'warning: could not raise the open-file limit\n' >&2

  if ! pdfunite "${pdfs[@]}" "$merged_pdf"; then
    printf 'error: pdfunite failed\n' >&2
    rm -f -- "$merged_pdf"
    return 1
  fi

  pdftotext "$merged_pdf" - > "$text_file" || rc=$?
  rm -f -- "$merged_pdf"
  if (( rc != 0 )); then
    printf 'error: pdftotext failed (status %d)\n' "$rc" >&2
    return 1
  fi

  # Write the final file for processing in R.
  if ! certificate_text_to_records < "$text_file" > "$tsv_file"; then
    printf 'error: could not write %s\n' "$tsv_file" >&2
    return 1
  fi

  # Follow-up steps, currently disabled:
  #rm ./PROCESSED_DATA/out.txt
  #Rscript R-Scripts/R_Clean.r
  #rm ./PROCESSED_DATA/temp.tsv
  #Rscript R-Scripts/MAKE_REG_DAT.r
  return 0
}

main "$@"