ASME_nuclear/R_Clean.r
2026-01-15 17:08:46 -07:00

95 lines
7.0 KiB
R

library(tidyverse)
#SAVE FOLDERS
if(!file.exists("./Data/PROCESSED_DATA/RDS")){dir.create(path="./Data/PROCESSED_DATA/RDS",recursive=TRUE)}
if(!file.exists("./Data/PROCESSED_DATA/CSV")){dir.create(path="./Data/PROCESSED_DATA/CSV",recursive=TRUE)}
temp <- read_delim("Data/PROCESSED_DATA/temp.tsv",trim_ws=TRUE,col_names=FALSE,delim="~")
problems()$expected %>% unique
temp <- temp[-1,]
ONE <- temp[,c(!rep(c(TRUE,FALSE),ncol(temp)/2))]
TWO <- temp[,c(rep(c(TRUE,FALSE),ncol(temp)/2))]
STORE <- list()
min(which(is.na(ONE[1,]))-1)
for(x in 1:nrow(temp)){
#Some shortcuts used. Generally this should be one minus the min value. However some entries have the first column as a NA, removing the first column fixes both issues.
MX_COL <- min(which(is.na(ONE[x,-1])))
DF <- ONE[x,1:MX_COL]
colnames(DF) <- gsub("DIVISION","DIV",gsub("EXTENSION","EXT",gsub("CERTIFICATION","CERT",gsub("NUMBER","NUM",gsub("EXPIRATION","EXP",gsub("AUTHORIZED","AUTH",gsub("ORIGINAL","ORIG",gsub("ADDRESS","ADDR",gsub("COMPANY","CO",gsub("ABBREVIATION","ABB",gsub("CERTIFICATE","CERT",gsub("ORIGINAL AUTHORIZED DATE","ORIG_AUTH_DATE",gsub(" ","_",toupper(c(TWO[x,1:(MX_COL)] ) ))))))))))))))
if("SCOPE_STATEMENT" %in% colnames(DF) ){
try(DF <- DF %>% rename(SCOPE=SCOPE_STATEMENT))
}
STORE[[length(STORE)+1]] <- DF
}
for(x in STORE){
print(x)
if(!exists("RES")){RES <- x} else{RES <- full_join(RES,x)}
}
print("Done!")
#There is a CERT type written out as "NA" R is reading this a NA value. Changing these to NAC. Skipping sub certs which inherit there type from the parent certificate.
RES[is.na(RES$CERT_TYPE),"CERT_TYPE"] <- "NAC"
##Fill in missing values of the sub cert and cert parent indicators.
RES$SUB_CERT <- as.logical(!is.na(RES$SUB_CERT))
RES$HAS_SUB_CERT <- as.logical(!is.na(RES$HAS_SUB_CERT))
####Create a column called MAIN_CERT that is the parent, in case a record is a sub cert.
#Filter out all records that are a SUB_CERT, find the last instance of a "-" and pull out the certificate number only up to that point.
RES_SUB_CERTS <- RES %>% filter(SUB_CERT) %>% mutate(MAIN_CERT=substr(CERT_NUM,1,tail(unlist(gregexpr('-', CERT_NUM))-1, n=1)) )
#For all records that are not a subcert make the MAIN_CERT equal to the CERT_NUM. This data will be filtered in the next step so that only non-subcerts remain.
RES$MAIN_CERT <- RES$CERT_NUM
#Combine the data that is not a sub_cert with the data that is a sub_cert, clean the order of selection.
RES <- RES %>% filter(!SUB_CERT) %>% rbind(RES_SUB_CERTS) %>% select(MAIN_CERT,everything())
#Apply Cert values to child certifications
RES <- RES %>% group_by(MAIN_CERT) %>% mutate(CERT_TYPE=max(CERT_TYPE,na.rm=TRUE),CERT_STATUS=max(CERT_STATUS,na.rm=TRUE),CO_NAME=max(CO_NAME,na.rm=TRUE),ABB=ifelse(n()!=sum(is.na(ABB)),max(ABB,na.rm=TRUE),NA),CO_ADDR=ifelse(n()!=sum(is.na(CO_ADDR)),max(CO_ADDR,na.rm=TRUE),NA),DIV_NAME=ifelse(n()!=sum(is.na(DIV_NAME)),max(DIV_NAME,na.rm=TRUE),NA)) %>% ungroup
#Drop if there is no Address (this seems rare but a couple were found)
RES <- RES %>% filter(!is.na(CO_ADDR))
RES$AUTH_DATE <- as.Date(RES$AUTH_DATE,'%m/%d/%Y')
RES$ORIG_AUTH_DATE <- as.Date(RES$ORIG_AUTH_DATE,'%m/%d/%Y')
RES$EXP_DATE <- as.Date(RES$EXP_DATE,'%m/%d/%Y')
RES$EXT_DATE <- as.Date(RES$EXT_DATE,'%m/%d/%Y')
colnames(RES)
TBL <- rbind(read_csv("Data/Raw_Data/Table_Data/ASME_Active_Certificate_Table_Data.csv"),read_csv("Data/Raw_Data/Table_Data/ASME_Terminated_Certificate_Table_Data.csv")) %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique
UPDATE <- read_csv("Data/Raw_Data/Table_Data/Updated_Data.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique
TBL <- TBL %>% filter(!(MAIN_CERT %in% (UPDATE %>% pull(MAIN_CERT))) )
TBL <- rbind(TBL,UPDATE)
RES_NAMES <- RES %>% select(MAIN_CERT,CO_NAME)
DROP <- TBL %>% unique %>% group_by(MAIN_CERT) %>% filter(n()>1) %>% arrange(MAIN_CERT) %>% left_join(RES_NAMES) %>% select(MAIN_CERT,CO_NAME,CO_NAME2) %>% mutate(PREFER=ifelse(toupper(CO_NAME)==toupper(CO_NAME2),1,0)) %>% filter(PREFER==0) %>% select(MAIN_CERT,CO_NAME2) %>% ungroup
TBL <- TBL %>% anti_join(DROP) %>% unique
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
#
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
TBL <- rbind(TBL,UPDATE_TBL)
#
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data_2.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
TBL <- rbind(TBL,UPDATE_TBL)
#
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data3.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
TBL <- rbind(TBL,UPDATE_TBL)
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
MISSING_IN_PDFS <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME)) %>% select(CO_NAME2) %>% unique
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
write_csv(MISSING_IN_PDFS,"Missing_From_PDS.csv")
RES <- RES %>% select(MAIN_CERT,CERT_NUM,CERT_TYPE,COUNTRY,STATE=STATE2,CITY=City,CERT_TYPE,CERT_STATUS,ORIG_AUTH_DATE,AUTH_DATE,EXP_DATE,CO_NAME=CO_NAME2,PLANT_ADDRESS,SCOPE)
RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NAC (Quality Assurance Program)',"NUA",RES$CERT_TYPE)
RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NPT (Quality Assurance Program)',"NPT",RES$CERT_TYPE)
RES <- RES %>% group_by(MAIN_CERT) %>% mutate(NUM_SUB_CERT=n()-1) %>% ungroup
RES <- RES %>% mutate(ORIG_AUTH_YEAR=year(ORIG_AUTH_DATE),AUTH_YEAR=year(AUTH_DATE),ORIG_AUTH_MONTH=year(ORIG_AUTH_DATE),AUTH_MONTH=year(AUTH_DATE))
#Add Nuclear indicator (all license with a nuclear related certificate)
RES <- RES %>% mutate(NUCLEAR=CERT_TYPE %in%c("N","N3","NAC","NPT","NS","NUA","NV","OWN","G","GC","MO"))
#RES <- RES %>% filter(CERT_TYPE!="OWN",CERT_TYPE!="NAC")
##SAVE
saveRDS(RES,"./Data/PROCESSED_DATA/RDS/ASME.Rds")
write_csv(RES,"./Data/PROCESSED_DATA/CSV/ASME.csv",quote="all")