# Bash script (source-viewer metadata: 50 lines, 3.3 KiB)
#Merge every PDF of the first run into one document, dump its text to
#out.txt, and load that text into "a" for the clean-up steps that follow.
#Stop immediately when a tool fails: otherwise the rest of the script would
#carry on with an empty "a" and write a table with no certificate data.
pdfunite Data/First_Run/*.pdf out.pdf || {
  echo "pdfunite failed for Data/First_Run/*.pdf" >&2
  exit 1
}

pdftotext "out.pdf" - > out.txt || {
  echo "pdftotext failed for out.pdf" >&2
  exit 1
}

#The merged PDF is only an intermediate file.
rm -- out.pdf

a=$(cat out.txt)
#######################################
# Turn the extracted text into one line in which every known field label is
# marked with "~" so that "~" can be used as the delimiter of the data.
# Arguments: $1 - raw text (may span many lines)
# Outputs:   the tagged text on stdout, as a single line
#######################################
tag_fields() {
  local IFS=$' \t\n'
  local -a words=()

  #Collapse every run of blanks/newlines into one space. "read -a" splits on
  #IFS the same way an unquoted expansion does, but it never glob-expands a
  #"*" or "?" in the text, and printf (unlike "echo -e") leaves backslashes
  #in the data untouched.
  read -r -d '' -a words <<<"$1" || true

  #The expressions run in order on the single line:
  # 1. No "~" were seen in the data, but to be safe turn any "~" into "--" so
  #    "~" is free to act as the delimiter.
  # 2. Replace the ":" that ends a column label with "~" and put a "~" in
  #    front of the label. "Original Authorized Date:" has to be handled
  #    before "Authorized Date:". "Certificate Type:" gets no leading "~".
  # 3. "Sites:" indicates that there are multiple certificates tied to the
  #    main certificate; it becomes the marker HEAD_CERT so that such rows can
  #    be picked out and processed separately later on.
  # 4. Prefix the line with "~" when it contains one, append a blank, remove
  #    the "CA Connect CERTIFICATE HOLDER DETAILS" banner, and drop the blank
  #    directly before or after each "~".
  printf '%s\n' "${words[*]}" | sed \
    -e 's/~/--/g' \
    -e 's/Abbreviation:/~Abbreviation~/g' \
    -e 's/Original Authorized Date:/~Original Authorized Date~/g' \
    -e 's/Authorized Date:/~Authorized Date~/g' \
    -e 's/Certificate Number:/~Certificate Number~/g' \
    -e 's/Certificate Status:/~Certificate Status~/g' \
    -e 's/Certificate Type:/Certificate Type~/g' \
    -e 's/Company Address:/~Company Address~/g' \
    -e 's/Company Name:/~Company Name~/g' \
    -e 's/Division Name:/~Division Name~/g' \
    -e 's/Expiration Date:/~Expiration Date~/g' \
    -e 's/Extension Date:/~Extension Date~/g' \
    -e 's/Scope:/~Scope~/g' \
    -e 's/Sites:/HEAD_CERT/g' \
    -e 's/Scope Statement:/~Scope Statement~/g' \
    -e '/~/s/^/~/' \
    -e 's/$/ /' \
    -e 's/ CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/ ~/~/g' \
    -e 's/~ /~/g'
}

a=$(tag_fields "$a")

########################################
#######################################
# Clean up the tagged text using "~" as the delimiter and break it into one
# row per certificate, below a header row made only of "~".
# Arguments: $1 - text in which the field labels are already "~"-tagged
# Outputs:   header row followed by one "Certificate Type~..." row per entry
#######################################
tidy_records() {
  local IFS=$' \t\n'
  local -a words=()
  local joined

  #Collapse all whitespace into single blanks without glob-expanding "*" or
  #"?" in the data (an unquoted "echo -e $a" would expand them and would also
  #interpret backslashes).
  read -r -d '' -a words <<<"$1" || true

  #First pass: prefix "~", append a blank, remove the page banner, split on
  #"~~~~" and drop the blank next to each "~".
  #NOTE(review): 's/^L//g' removes a literal leading "L". It was possibly
  #meant to strip form-feed characters (shown as ^L in editors) - confirm.
  joined=$(printf '%s\n' "${words[*]}" | sed \
    -e '/~/s/^/~/' \
    -e 's/$/ /' \
    -e 's/^L//g' \
    -e 's/ CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/CA Connect CERTIFICATE HOLDER DETAILS //g' \
    -e 's/Sites:/HAS_SUB_CERT:TRUE~~~~Sites:/g' \
    -e 's/~~~~/\n/g' \
    -e 's/ ~/~/g' \
    -e 's/~ /~/g')

  #Any newline produced above is folded back into a blank here.
  read -r -d '' -a words <<<"$joined" || true

  #Second pass:
  # - Replace the cert type of NA with NAC.
  # - Add a header with many "~" so that R won't truncate the data when a row
  #   has fewer fields than expected. R does most of the manipulation needed
  #   to put the data into a standard table.
  # - "Certificate Type" always begins a new entry, so start a new line
  #   in front of it.
  printf '%s\n' "${words[*]}" | sed \
    -e 's/Certificate Type~NA/Certificate Type~NAC/g' \
    -e '1 s/^/~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~/' \
    -e 's/Certificate Type/\nCertificate Type/g'
}

a=$(tidy_records "$a")

############################
#Some nuclear certificates have sub certificates that apply to certain things.
#"b" receives those rows for separate processing; "a" keeps all other rows.

#######################################
# Pull out the rows that carry the HEAD_CERT marker and split each of them
# into a head row plus one row per sub certificate.
# Arguments: $1 - table text, one certificate per line
# Outputs:   "HAS_SUB_CERT~TRUE~..." head rows and
#            "SUB_CERT~TRUE~Certificate Number~..." rows; nothing when no
#            line contains HEAD_CERT
#######################################
extract_sub_certs() {
  #printf instead of "echo -e" so backslashes in the data are kept as they are.
  #Separate sed processes are needed because each stage inserts newlines that
  #the next stage has to see as line breaks.
  printf '%s\n' "$1" \
    | grep HEAD_CERT \
    | sed -e 's/^/HAS_SUB_CERT~TRUE~/' -e 's/HEAD_CERT/\n/' \
    | sed -e 's/Certification Number:/\nSUB_CERT~TRUE~Certificate Number~/g' \
    | sed -E -e '/^[[:space:]]*$/d' -e 's/ ~/~/g' -e 's/~ /~/g'
}

#######################################
# Print the rows that do not carry the HEAD_CERT marker.
# Arguments: $1 - table text, one certificate per line
#######################################
drop_head_cert_rows() {
  printf '%s\n' "$1" | grep -v HEAD_CERT
}

#"b" must be built before the HEAD_CERT rows are removed from "a".
b=$(extract_sub_certs "$a")

a=$(drop_head_cert_rows "$a")

#######################################
# Combine the two data types - "a" being all normal certificates, "b" being
# all certificates that have/are sub certificates - and remove duplicate
# lines.
# Arguments: $1 - rows of normal certificates
#            $2 - rows of head/sub certificates
# Outputs:   the sorted rows, each distinct line exactly once
#######################################
merge_unique_lines() {
  #Plain "uniq" keeps one copy of every line. "uniq -u" would instead drop
  #every line that occurs more than once, losing those certificates entirely.
  printf '%s\n%s\n' "$1" "$2" | sort | uniq
}

c=$(merge_unique_lines "$a" "$b")

#Write to the final file for processing in R.
#The target directory is created when it is missing, and printf is used so
#that the data is written verbatim whatever it starts with.
out_file=./Data/PROCESSED_DATA/temp.tsv
mkdir -p -- "${out_file%/*}"
printf '%s\n' "$c" > "$out_file"

#Follow-up steps, currently disabled:
#rm ./PROCESSED_DATA/out.txt
#Rscript R-Scripts/R_Clean.r
#rm ./PROCESSED_DATA/temp.tsv
#Rscript R-Scripts/MAKE_REG_DAT.r