# ASME_nuclear/Proc.sh
# 2026-01-14 16:51:02 -07:00
#
# 51 lines
# 3.3 KiB
# Bash

# Merge every raw PDF into one document, dump its text to out.txt and load
# that text into $a for the sed-based clean-up that follows.
#
# The open-file limit is raised first; presumably this is so pdfunite can
# hold all input PDFs open at once (TODO confirm the required limit).
ulimit -n 4096

# Stop immediately when a tool fails: otherwise the rest of the script runs
# on an empty out.txt and silently writes a header-only output file.
pdfunite Data/Raw_Data/Complete_PDF_Data/*.pdf out.pdf \
  || { echo "Proc.sh: pdfunite failed; no merged PDF to process" >&2; exit 1; }
pdftotext "out.pdf" - > out.txt \
  || { echo "Proc.sh: pdftotext failed on out.pdf" >&2; exit 1; }

# The merged PDF is only an intermediate artefact.
rm out.pdf
a=$(cat out.txt)
#######################################
# Flatten text to a single line and turn every known "Label:" into a
# "~"-delimited column marker.
#
# Reads the raw text on stdin and writes one line to stdout:
#   * runs of spaces, tabs and newlines collapse to one space and leading or
#     trailing blanks are dropped (the same result unquoted word splitting
#     gives, but without pathname expansion or backslash interpretation, so
#     tokens such as "*" or "\t" in the data survive unchanged);
#   * any "~" already in the data becomes "--" so "~" is free to be used as
#     the delimiter;
#   * each column label loses its ":" and is wrapped in "~". "Certificate
#     Type" only gets a trailing "~";
#   * "Sites:" marks a certificate that has further certificates tied to it;
#     it becomes HEAD_CERT so those rows can be handled separately later in
#     the "b" variable.
# "Original Authorized Date:" is rewritten before "Authorized Date:" so the
# shorter label cannot match inside the longer one.
#######################################
mark_field_labels() {
  { tr -s ' \t\n' '   '; printf '\n'; } \
    | sed \
      -e 's/^ //' \
      -e 's/ $//' \
      -e 's/~/--/g' \
      -e 's/Abbreviation:/~Abbreviation~/g' \
      -e 's/Original Authorized Date:/~Original Authorized Date~/g' \
      -e 's/Authorized Date:/~Authorized Date~/g' \
      -e 's/Certificate Number:/~Certificate Number~/g' \
      -e 's/Certificate Status:/~Certificate Status~/g' \
      -e 's/Certificate Type:/Certificate Type~/g' \
      -e 's/Company Address:/~Company Address~/g' \
      -e 's/Company Name:/~Company Name~/g' \
      -e 's/Division Name:/~Division Name~/g' \
      -e 's/Expiration Date:/~Expiration Date~/g' \
      -e 's/Extension Date:/~Extension Date~/g' \
      -e 's/Scope:/~Scope~/g' \
      -e 's/Sites:/HEAD_CERT/g' \
      -e 's/Scope Statement:/~Scope Statement~/g'
}

a=$(printf '%s' "$a" | mark_field_labels)
#######################################
# Print $1 as one line: runs of spaces, tabs and newlines become a single
# space and leading/trailing blanks are removed. This is what unquoted word
# splitting produces, minus pathname expansion and backslash interpretation,
# so "*" or "\n" inside the data are passed through literally.
#######################################
squash_ws() {
  { printf '%s' "$1" | tr -s ' \t\n' '   '; printf '\n'; } \
    | sed -e 's/^ //' -e 's/ $//'
}

#######################################
# Tidy the "~"-delimited text in $1 and print it as one record per line.
#
# The four passes are kept in their existing order because the exact output
# layout is consumed later (by R, according to the notes in this script):
#   1. prefix "~" when the text contains one, drop the repeated
#      "CA Connect CERTIFICATE HOLDER DETAILS" banner and trim the blank on
#      either side of every "~";
#   2. the same clean-up again (so the text ends up with two leading "~"),
#      plus the extra rules noted inline;
#   3. rename certificate type NA to NAC;
#   4. put a row of many "~" in front of line 1 so R does not truncate the
#      data when a row has fewer columns than expected, and start a new line
#      before every "Certificate Type", which always begins a new entry.
#######################################
tidy_records() {
  local text=$1

  text=$(squash_ws "$text" | sed \
    -e '/~/s/^/~/' \
    -e 's/$/ /' \
    -e 's/ CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/ ~/~/g' \
    -e 's/~ /~/g')

  # NOTE(review): 's/^L//g' is the regex "an L at the start of the line", not
  # a form-feed character; if the goal was to strip page breaks emitted by
  # pdftotext this rule does not do it - confirm the intent.
  # NOTE(review): the "Sites:" rule cannot match when "Sites:" has already
  # been rewritten to HEAD_CERT earlier in the script; kept unchanged.
  text=$(squash_ws "$text" | sed \
    -e '/~/s/^/~/' \
    -e 's/$/ /' \
    -e 's/^L//g' \
    -e 's/ CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/Sites:/HAS_SUB_CERT:TRUE~~~~Sites:/g' \
    -e 's/~~~~/\n/g' \
    -e 's/ ~/~/g' \
    -e 's/~ /~/g')

  # Replace the cert type of NA with NAC.
  text=$(squash_ws "$text" | sed 's/Certificate Type~NA/Certificate Type~NAC/g')

  squash_ws "$text" | sed \
    -e '1 s/^/~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~/' \
    -e 's/Certificate Type/\nCertificate Type/g'
}

a=$(tidy_records "$a")
############################
# Some nuclear certificates have sub certificates that apply to certain
# things. split_sub_certs moves every line of $a that carries the HEAD_CERT
# marker into $b for separate processing and leaves the remaining lines in $a.
#
# Globals: a (read, rewritten), b (written)
#
# In $b the head line is prefixed with HAS_SUB_CERT~TRUE~ and cut at the first
# HEAD_CERT; every "Certification Number:" that follows starts a new line
# prefixed with SUB_CERT~TRUE~Certificate Number~. Blank lines are dropped and
# the blank on either side of a "~" is trimmed.
#
# printf '%s\n' is used instead of echo -e so backslash sequences that occur
# in the data (for example "\t" or "\c") are not expanded or allowed to cut
# the output short.
split_sub_certs() {
  b=$(printf '%s\n' "$a" \
    | grep HEAD_CERT \
    | sed -e 's/^/HAS_SUB_CERT~TRUE~/' -e 's/HEAD_CERT/\n/')
  # The blank-line filter has to be its own sed so it sees the lines created
  # by the "\n" inserted just before it.
  b=$(printf '%s\n' "$b" \
    | sed 's/Certification Number:/\nSUB_CERT~TRUE~Certificate Number~/g' \
    | sed -r '/^\s*$/d' \
    | sed -e 's/ ~/~/g' -e 's/~ /~/g')
  a=$(printf '%s\n' "$a" | grep -v HEAD_CERT)
}

split_sub_certs
#######################################
# Sort the lines on stdin and keep exactly one copy of each.
# `uniq` is used without -u on purpose: `uniq -u` discards every copy of a
# repeated line, which would drop a duplicated record entirely instead of
# de-duplicating it.
#######################################
dedupe_lines() {
  sort | uniq
}

# Combine the two data types: "a" holds all normal certificates, "b" holds all
# certificates that have, or are, sub certificates.
c=$(printf '%s\n%s\n' "$a" "$b")
# Remove duplicate lines, keeping one copy of each.
c=$(printf '%s\n' "$c" | dedupe_lines)
# Write to the final file for processing in R.
printf '%s\n' "$c" > ./Data/PROCESSED_DATA/temp.tsv
#rm ./PROCESSED_DATA/out.txt
#Rscript R-Scripts/R_Clean.r
#rm ./PROCESSED_DATA/temp.tsv
#Rscript R-Scripts/MAKE_REG_DAT.r