95 lines
7.0 KiB
R
95 lines
7.0 KiB
R
library(tidyverse)
|
|
|
|
#SAVE FOLDERS
|
|
if(!file.exists("./Data/PROCESSED_DATA/RDS")){dir.create(path="./Data/PROCESSED_DATA/RDS",recursive=TRUE)}
|
|
if(!file.exists("./Data/PROCESSED_DATA/CSV")){dir.create(path="./Data/PROCESSED_DATA/CSV",recursive=TRUE)}
|
|
|
|
temp <- read_delim("Data/PROCESSED_DATA/temp.tsv",trim_ws=TRUE,col_names=FALSE,delim="~")
|
|
problems()$expected %>% unique
|
|
temp <- temp[-1,]
|
|
|
|
ONE <- temp[,c(!rep(c(TRUE,FALSE),ncol(temp)/2))]
|
|
TWO <- temp[,c(rep(c(TRUE,FALSE),ncol(temp)/2))]
|
|
|
|
STORE <- list()
|
|
min(which(is.na(ONE[1,]))-1)
|
|
for(x in 1:nrow(temp)){
|
|
#Some shortcuts used. Generally this should be one minus the min value. However some entries have the first column as a NA, removing the first column fixes both issues.
|
|
MX_COL <- min(which(is.na(ONE[x,-1])))
|
|
DF <- ONE[x,1:MX_COL]
|
|
colnames(DF) <- gsub("DIVISION","DIV",gsub("EXTENSION","EXT",gsub("CERTIFICATION","CERT",gsub("NUMBER","NUM",gsub("EXPIRATION","EXP",gsub("AUTHORIZED","AUTH",gsub("ORIGINAL","ORIG",gsub("ADDRESS","ADDR",gsub("COMPANY","CO",gsub("ABBREVIATION","ABB",gsub("CERTIFICATE","CERT",gsub("ORIGINAL AUTHORIZED DATE","ORIG_AUTH_DATE",gsub(" ","_",toupper(c(TWO[x,1:(MX_COL)] ) ))))))))))))))
|
|
|
|
if("SCOPE_STATEMENT" %in% colnames(DF) ){
|
|
try(DF <- DF %>% rename(SCOPE=SCOPE_STATEMENT))
|
|
}
|
|
|
|
STORE[[length(STORE)+1]] <- DF
|
|
}
|
|
for(x in STORE){
|
|
print(x)
|
|
if(!exists("RES")){RES <- x} else{RES <- full_join(RES,x)}
|
|
}
|
|
print("Done!")
|
|
#There is a CERT type written out as "NA" R is reading this a NA value. Changing these to NAC. Skipping sub certs which inherit there type from the parent certificate.
|
|
RES[is.na(RES$CERT_TYPE),"CERT_TYPE"] <- "NAC"
|
|
##Fill in missing values of the sub cert and cert parent indicators.
|
|
RES$SUB_CERT <- as.logical(!is.na(RES$SUB_CERT))
|
|
RES$HAS_SUB_CERT <- as.logical(!is.na(RES$HAS_SUB_CERT))
|
|
####Create a column called MAIN_CERT that is the parent, in case a record is a sub cert.
|
|
#Filter out all records that are a SUB_CERT, find the last instance of a "-" and pull out the certificate number only up to that point.
|
|
RES_SUB_CERTS <- RES %>% filter(SUB_CERT) %>% mutate(MAIN_CERT=substr(CERT_NUM,1,tail(unlist(gregexpr('-', CERT_NUM))-1, n=1)) )
|
|
#For all records that are not a subcert make the MAIN_CERT equal to the CERT_NUM. This data will be filtered in the next step so that only non-subcerts remain.
|
|
RES$MAIN_CERT <- RES$CERT_NUM
|
|
#Combine the data that is not a sub_cert with the data that is a sub_cert, clean the order of selection.
|
|
RES <- RES %>% filter(!SUB_CERT) %>% rbind(RES_SUB_CERTS) %>% select(MAIN_CERT,everything())
|
|
#Apply Cert values to child certifications
|
|
RES <- RES %>% group_by(MAIN_CERT) %>% mutate(CERT_TYPE=max(CERT_TYPE,na.rm=TRUE),CERT_STATUS=max(CERT_STATUS,na.rm=TRUE),CO_NAME=max(CO_NAME,na.rm=TRUE),ABB=ifelse(n()!=sum(is.na(ABB)),max(ABB,na.rm=TRUE),NA),CO_ADDR=ifelse(n()!=sum(is.na(CO_ADDR)),max(CO_ADDR,na.rm=TRUE),NA),DIV_NAME=ifelse(n()!=sum(is.na(DIV_NAME)),max(DIV_NAME,na.rm=TRUE),NA)) %>% ungroup
|
|
#Drop if there is no Address (this seems rare but a couple were found)
|
|
RES <- RES %>% filter(!is.na(CO_ADDR))
|
|
RES$AUTH_DATE <- as.Date(RES$AUTH_DATE,'%m/%d/%Y')
|
|
RES$ORIG_AUTH_DATE <- as.Date(RES$ORIG_AUTH_DATE,'%m/%d/%Y')
|
|
RES$EXP_DATE <- as.Date(RES$EXP_DATE,'%m/%d/%Y')
|
|
RES$EXT_DATE <- as.Date(RES$EXT_DATE,'%m/%d/%Y')
|
|
colnames(RES)
|
|
TBL <- rbind(read_csv("Data/Raw_Data/Table_Data/ASME_Active_Certificate_Table_Data.csv"),read_csv("Data/Raw_Data/Table_Data/ASME_Terminated_Certificate_Table_Data.csv")) %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique
|
|
|
|
UPDATE <- read_csv("Data/Raw_Data/Table_Data/Updated_Data.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique
|
|
TBL <- TBL %>% filter(!(MAIN_CERT %in% (UPDATE %>% pull(MAIN_CERT))) )
|
|
TBL <- rbind(TBL,UPDATE)
|
|
RES_NAMES <- RES %>% select(MAIN_CERT,CO_NAME)
|
|
DROP <- TBL %>% unique %>% group_by(MAIN_CERT) %>% filter(n()>1) %>% arrange(MAIN_CERT) %>% left_join(RES_NAMES) %>% select(MAIN_CERT,CO_NAME,CO_NAME2) %>% mutate(PREFER=ifelse(toupper(CO_NAME)==toupper(CO_NAME2),1,0)) %>% filter(PREFER==0) %>% select(MAIN_CERT,CO_NAME2) %>% ungroup
|
|
TBL <- TBL %>% anti_join(DROP) %>% unique
|
|
|
|
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
|
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
|
#
|
|
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
|
|
TBL <- rbind(TBL,UPDATE_TBL)
|
|
#
|
|
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
|
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data_2.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
|
|
TBL <- rbind(TBL,UPDATE_TBL)
|
|
#
|
|
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
|
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data3.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
|
|
TBL <- rbind(TBL,UPDATE_TBL)
|
|
|
|
|
|
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
|
MISSING_IN_PDFS <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME)) %>% select(CO_NAME2) %>% unique
|
|
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
|
write_csv(MISSING_IN_PDFS,"Missing_From_PDS.csv")
|
|
RES <- RES %>% select(MAIN_CERT,CERT_NUM,CERT_TYPE,COUNTRY,STATE=STATE2,CITY=City,CERT_TYPE,CERT_STATUS,ORIG_AUTH_DATE,AUTH_DATE,EXP_DATE,CO_NAME=CO_NAME2,PLANT_ADDRESS,SCOPE)
|
|
|
|
RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NAC (Quality Assurance Program)',"NUA",RES$CERT_TYPE)
|
|
RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NPT (Quality Assurance Program)',"NPT",RES$CERT_TYPE)
|
|
RES <- RES %>% group_by(MAIN_CERT) %>% mutate(NUM_SUB_CERT=n()-1) %>% ungroup
|
|
RES <- RES %>% mutate(ORIG_AUTH_YEAR=year(ORIG_AUTH_DATE),AUTH_YEAR=year(AUTH_DATE),ORIG_AUTH_MONTH=year(ORIG_AUTH_DATE),AUTH_MONTH=year(AUTH_DATE))
|
|
#Add Nuclear indicator (all license with a nuclear related certificate)
|
|
RES <- RES %>% mutate(NUCLEAR=CERT_TYPE %in%c("N","N3","NAC","NPT","NS","NUA","NV","OWN","G","GC","MO"))
|
|
#RES <- RES %>% filter(CERT_TYPE!="OWN",CERT_TYPE!="NAC")
|
|
##SAVE
|
|
saveRDS(RES,"./Data/PROCESSED_DATA/RDS/ASME.Rds")
|
|
write_csv(RES,"./Data/PROCESSED_DATA/CSV/ASME.csv",quote="all")
|
|
|