library(tidyverse) #SAVE FOLDERS if(!file.exists("./Data/PROCESSED_DATA/RDS")){dir.create(path="./Data/PROCESSED_DATA/RDS",recursive=TRUE)} if(!file.exists("./Data/PROCESSED_DATA/CSV")){dir.create(path="./Data/PROCESSED_DATA/CSV",recursive=TRUE)} temp <- read_delim("Data/PROCESSED_DATA/temp.tsv",trim_ws=TRUE,col_names=FALSE,delim="~") problems()$expected %>% unique temp <- temp[-1,] ONE <- temp[,c(!rep(c(TRUE,FALSE),ncol(temp)/2))] TWO <- temp[,c(rep(c(TRUE,FALSE),ncol(temp)/2))] STORE <- list() min(which(is.na(ONE[1,]))-1) for(x in 1:nrow(temp)){ #Some shortcuts used. Generally this should be one minus the min value. However some entries have the first column as a NA, removing the first column fixes both issues. MX_COL <- min(which(is.na(ONE[x,-1]))) DF <- ONE[x,1:MX_COL] colnames(DF) <- gsub("DIVISION","DIV",gsub("EXTENSION","EXT",gsub("CERTIFICATION","CERT",gsub("NUMBER","NUM",gsub("EXPIRATION","EXP",gsub("AUTHORIZED","AUTH",gsub("ORIGINAL","ORIG",gsub("ADDRESS","ADDR",gsub("COMPANY","CO",gsub("ABBREVIATION","ABB",gsub("CERTIFICATE","CERT",gsub("ORIGINAL AUTHORIZED DATE","ORIG_AUTH_DATE",gsub(" ","_",toupper(c(TWO[x,1:(MX_COL)] ) )))))))))))))) if("SCOPE_STATEMENT" %in% colnames(DF) ){ try(DF <- DF %>% rename(SCOPE=SCOPE_STATEMENT)) } STORE[[length(STORE)+1]] <- DF } for(x in STORE){ print(x) if(!exists("RES")){RES <- x} else{RES <- full_join(RES,x)} } print("Done!") #There is a CERT type written out as "NA" R is reading this a NA value. Changing these to NAC. Skipping sub certs which inherit there type from the parent certificate. RES[is.na(RES$CERT_TYPE),"CERT_TYPE"] <- "NAC" ##Fill in missing values of the sub cert and cert parent indicators. RES$SUB_CERT <- as.logical(!is.na(RES$SUB_CERT)) RES$HAS_SUB_CERT <- as.logical(!is.na(RES$HAS_SUB_CERT)) ####Create a column called MAIN_CERT that is the parent, in case a record is a sub cert. #Filter out all records that are a SUB_CERT, find the last instance of a "-" and pull out the certificate number only up to that point. RES_SUB_CERTS <- RES %>% filter(SUB_CERT) %>% mutate(MAIN_CERT=substr(CERT_NUM,1,tail(unlist(gregexpr('-', CERT_NUM))-1, n=1)) ) #For all records that are not a subcert make the MAIN_CERT equal to the CERT_NUM. This data will be filtered in the next step so that only non-subcerts remain. RES$MAIN_CERT <- RES$CERT_NUM #Combine the data that is not a sub_cert with the data that is a sub_cert, clean the order of selection. RES <- RES %>% filter(!SUB_CERT) %>% rbind(RES_SUB_CERTS) %>% select(MAIN_CERT,everything()) #Apply Cert values to child certifications RES <- RES %>% group_by(MAIN_CERT) %>% mutate(CERT_TYPE=max(CERT_TYPE,na.rm=TRUE),CERT_STATUS=max(CERT_STATUS,na.rm=TRUE),CO_NAME=max(CO_NAME,na.rm=TRUE),ABB=ifelse(n()!=sum(is.na(ABB)),max(ABB,na.rm=TRUE),NA),CO_ADDR=ifelse(n()!=sum(is.na(CO_ADDR)),max(CO_ADDR,na.rm=TRUE),NA),DIV_NAME=ifelse(n()!=sum(is.na(DIV_NAME)),max(DIV_NAME,na.rm=TRUE),NA)) %>% ungroup #Drop if there is no Address (this seems rare but a couple were found) RES <- RES %>% filter(!is.na(CO_ADDR)) RES$AUTH_DATE <- as.Date(RES$AUTH_DATE,'%m/%d/%Y') RES$ORIG_AUTH_DATE <- as.Date(RES$ORIG_AUTH_DATE,'%m/%d/%Y') RES$EXP_DATE <- as.Date(RES$EXP_DATE,'%m/%d/%Y') RES$EXT_DATE <- as.Date(RES$EXT_DATE,'%m/%d/%Y') colnames(RES) TBL <- rbind(read_csv("Data/Raw_Data/Table_Data/ASME_Active_Certificate_Table_Data.csv"),read_csv("Data/Raw_Data/Table_Data/ASME_Terminated_Certificate_Table_Data.csv")) %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique UPDATE <- read_csv("Data/Raw_Data/Table_Data/Updated_Data.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique TBL <- TBL %>% filter(!(MAIN_CERT %in% (UPDATE %>% pull(MAIN_CERT))) ) TBL <- rbind(TBL,UPDATE) RES_NAMES <- RES %>% select(MAIN_CERT,CO_NAME) DROP <- TBL %>% unique %>% group_by(MAIN_CERT) %>% filter(n()>1) %>% arrange(MAIN_CERT) %>% left_join(RES_NAMES) %>% select(MAIN_CERT,CO_NAME,CO_NAME2) %>% mutate(PREFER=ifelse(toupper(CO_NAME)==toupper(CO_NAME2),1,0)) %>% filter(PREFER==0) %>% select(MAIN_CERT,CO_NAME2) %>% ungroup TBL <- TBL %>% anti_join(DROP) %>% unique MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique EXISTING_CERTS <- TBL %>% pull(MAIN_CERT) # UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS)) TBL <- rbind(TBL,UPDATE_TBL) # EXISTING_CERTS <- TBL %>% pull(MAIN_CERT) UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data_2.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS)) TBL <- rbind(TBL,UPDATE_TBL) # EXISTING_CERTS <- TBL %>% pull(MAIN_CERT) UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data3.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS)) TBL <- rbind(TBL,UPDATE_TBL) MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique MISSING_IN_PDFS <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME)) %>% select(CO_NAME2) %>% unique write_csv(MISSING_IN_TBL,"Missing_From_Table.csv") write_csv(MISSING_IN_PDFS,"Missing_From_PDS.csv") RES <- RES %>% select(MAIN_CERT,CERT_NUM,CERT_TYPE,COUNTRY,STATE=STATE2,CITY=City,CERT_TYPE,CERT_STATUS,ORIG_AUTH_DATE,AUTH_DATE,EXP_DATE,CO_NAME=CO_NAME2,PLANT_ADDRESS,SCOPE) RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NAC (Quality Assurance Program)',"NUA",RES$CERT_TYPE) RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NPT (Quality Assurance Program)',"NPT",RES$CERT_TYPE) RES <- RES %>% group_by(MAIN_CERT) %>% mutate(NUM_SUB_CERT=n()-1) %>% ungroup RES <- RES %>% mutate(ORIG_AUTH_YEAR=year(ORIG_AUTH_DATE),AUTH_YEAR=year(AUTH_DATE),ORIG_AUTH_MONTH=year(ORIG_AUTH_DATE),AUTH_MONTH=year(AUTH_DATE)) #Add Nuclear indicator (all license with a nuclear related certificate) RES <- RES %>% mutate(NUCLEAR=CERT_TYPE %in%c("N","N3","NAC","NPT","NS","NUA","NV","OWN","G","GC","MO")) #RES <- RES %>% filter(CERT_TYPE!="OWN",CERT_TYPE!="NAC") ##SAVE saveRDS(RES,"./Data/PROCESSED_DATA/RDS/ASME.Rds") write_csv(RES,"./Data/PROCESSED_DATA/CSV/ASME.csv",quote="all")