Updated data, and cleaning
This commit is contained in:
parent
cb4028aa19
commit
d721637411
3
Proc.sh
3
Proc.sh
@ -1,4 +1,5 @@
|
|||||||
pdfunite Data/First_Run/*.pdf out.pdf
|
ulimit -n 4096
|
||||||
|
pdfunite Data/Raw_Data/Complete_PDF_Data/*.pdf out.pdf
|
||||||
pdftotext "out.pdf" - > out.txt
|
pdftotext "out.pdf" - > out.txt
|
||||||
rm out.pdf
|
rm out.pdf
|
||||||
a=$(cat out.txt)
|
a=$(cat out.txt)
|
||||||
|
|||||||
73
R_Clean.r
73
R_Clean.r
@ -22,7 +22,7 @@ for(x in 1:nrow(temp)){
|
|||||||
colnames(DF) <- gsub("DIVISION","DIV",gsub("EXTENSION","EXT",gsub("CERTIFICATION","CERT",gsub("NUMBER","NUM",gsub("EXPIRATION","EXP",gsub("AUTHORIZED","AUTH",gsub("ORIGINAL","ORIG",gsub("ADDRESS","ADDR",gsub("COMPANY","CO",gsub("ABBREVIATION","ABB",gsub("CERTIFICATE","CERT",gsub("ORIGINAL AUTHORIZED DATE","ORIG_AUTH_DATE",gsub(" ","_",toupper(c(TWO[x,1:(MX_COL)] ) ))))))))))))))
|
colnames(DF) <- gsub("DIVISION","DIV",gsub("EXTENSION","EXT",gsub("CERTIFICATION","CERT",gsub("NUMBER","NUM",gsub("EXPIRATION","EXP",gsub("AUTHORIZED","AUTH",gsub("ORIGINAL","ORIG",gsub("ADDRESS","ADDR",gsub("COMPANY","CO",gsub("ABBREVIATION","ABB",gsub("CERTIFICATE","CERT",gsub("ORIGINAL AUTHORIZED DATE","ORIG_AUTH_DATE",gsub(" ","_",toupper(c(TWO[x,1:(MX_COL)] ) ))))))))))))))
|
||||||
|
|
||||||
if("SCOPE_STATEMENT" %in% colnames(DF) ){
|
if("SCOPE_STATEMENT" %in% colnames(DF) ){
|
||||||
DF <- DF %>% rename(SCOPE=SCOPE_STATEMENT)
|
try(DF <- DF %>% rename(SCOPE=SCOPE_STATEMENT))
|
||||||
}
|
}
|
||||||
|
|
||||||
STORE[[length(STORE)+1]] <- DF
|
STORE[[length(STORE)+1]] <- DF
|
||||||
@ -31,6 +31,7 @@ for(x in STORE){
|
|||||||
print(x)
|
print(x)
|
||||||
if(!exists("RES")){RES <- x} else{RES <- full_join(RES,x)}
|
if(!exists("RES")){RES <- x} else{RES <- full_join(RES,x)}
|
||||||
}
|
}
|
||||||
|
print("Done!")
|
||||||
#There is a CERT type written out as "NA", which R reads as a missing (NA) value. Changing these to "NAC". Skipping sub certs, which inherit their type from the parent certificate.
|
#There is a CERT type written out as "NA", which R reads as a missing (NA) value. Changing these to "NAC". Skipping sub certs, which inherit their type from the parent certificate.
|
||||||
RES[is.na(RES$CERT_TYPE),"CERT_TYPE"] <- "NAC"
|
RES[is.na(RES$CERT_TYPE),"CERT_TYPE"] <- "NAC"
|
||||||
##Fill in missing values of the sub cert and cert parent indicators.
|
##Fill in missing values of the sub cert and cert parent indicators.
|
||||||
@ -45,26 +46,76 @@ RES$MAIN_CERT <- RES$CERT_NUM
|
|||||||
RES <- RES %>% filter(!SUB_CERT) %>% rbind(RES_SUB_CERTS) %>% select(MAIN_CERT,everything())
|
RES <- RES %>% filter(!SUB_CERT) %>% rbind(RES_SUB_CERTS) %>% select(MAIN_CERT,everything())
|
||||||
#Apply Cert values to child certifications
|
#Apply Cert values to child certifications
|
||||||
RES <- RES %>% group_by(MAIN_CERT) %>% mutate(CERT_TYPE=max(CERT_TYPE,na.rm=TRUE),CERT_STATUS=max(CERT_STATUS,na.rm=TRUE),CO_NAME=max(CO_NAME,na.rm=TRUE),ABB=ifelse(n()!=sum(is.na(ABB)),max(ABB,na.rm=TRUE),NA),CO_ADDR=ifelse(n()!=sum(is.na(CO_ADDR)),max(CO_ADDR,na.rm=TRUE),NA),DIV_NAME=ifelse(n()!=sum(is.na(DIV_NAME)),max(DIV_NAME,na.rm=TRUE),NA)) %>% ungroup
|
RES <- RES %>% group_by(MAIN_CERT) %>% mutate(CERT_TYPE=max(CERT_TYPE,na.rm=TRUE),CERT_STATUS=max(CERT_STATUS,na.rm=TRUE),CO_NAME=max(CO_NAME,na.rm=TRUE),ABB=ifelse(n()!=sum(is.na(ABB)),max(ABB,na.rm=TRUE),NA),CO_ADDR=ifelse(n()!=sum(is.na(CO_ADDR)),max(CO_ADDR,na.rm=TRUE),NA),DIV_NAME=ifelse(n()!=sum(is.na(DIV_NAME)),max(DIV_NAME,na.rm=TRUE),NA)) %>% ungroup
|
||||||
|
|
||||||
#Drop if there is no Address (this seems rare but a couple were found)
|
#Drop if there is no Address (this seems rare but a couple were found)
|
||||||
RES <- RES %>% filter(!is.na(CO_ADDR))
|
RES <- RES %>% filter(!is.na(CO_ADDR))
|
||||||
RES$AUTH_DATE <- as.Date(RES$AUTH_DATE,'%m/%d/%Y')
|
RES$AUTH_DATE <- as.Date(RES$AUTH_DATE,'%m/%d/%Y')
|
||||||
RES$ORIG_AUTH_DATE <- as.Date(RES$ORIG_AUTH_DATE,'%m/%d/%Y')
|
RES$ORIG_AUTH_DATE <- as.Date(RES$ORIG_AUTH_DATE,'%m/%d/%Y')
|
||||||
RES$EXP_DATE <- as.Date(RES$EXP_DATE,'%m/%d/%Y')
|
RES$EXP_DATE <- as.Date(RES$EXP_DATE,'%m/%d/%Y')
|
||||||
RES$EXT_DATE <- as.Date(RES$EXT_DATE,'%m/%d/%Y')
|
RES$EXT_DATE <- as.Date(RES$EXT_DATE,'%m/%d/%Y')
|
||||||
#RES$CO_NAME <- ifelse(RES$CO_NAME=="AREA Energy, Inc","AREA Energy Inc.",RES$CO_NAME)
|
colnames(RES)
|
||||||
#RES$CO_NAME <- ifelse(RES$CO_NAME %in% c("BWXT Canada Ltd.","BWXT Nuclear Energy, Inc.","Babcock & Wilcox Nuclear Energy Inc","Babcock & Wilcox, Nuclear Operations Group, Inc.- Euclid Operations"),"BWTX",RES$CO_NAME)
|
TBL <- rbind(read_csv("Data/Raw_Data/Table_Data/ASME_Active_Certificate_Table_Data.csv"),read_csv("Data/Raw_Data/Table_Data/ASME_Terminated_Certificate_Table_Data.csv")) %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique
|
||||||
|
|
||||||
|
UPDATE <- read_csv("Data/Raw_Data/Table_Data/Updated_Data.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique
|
||||||
|
TBL <- TBL %>% filter(!(MAIN_CERT %in% (UPDATE %>% pull(MAIN_CERT))) )
|
||||||
|
TBL <- rbind(TBL,UPDATE)
|
||||||
|
RES_NAMES <- RES %>% select(MAIN_CERT,CO_NAME)
|
||||||
|
|
||||||
|
DROP <- TBL %>% unique %>% group_by(MAIN_CERT) %>% filter(n()>1) %>% arrange(MAIN_CERT) %>% left_join(RES_NAMES) %>% select(MAIN_CERT,CO_NAME,CO_NAME2) %>% mutate(PREFER=ifelse(toupper(CO_NAME)==toupper(CO_NAME2),1,0)) %>% filter(PREFER==0) %>% select(MAIN_CERT,CO_NAME2) %>% ungroup
|
||||||
|
TBL <- TBL %>% anti_join(DROP) %>% unique
|
||||||
|
##########Change later
|
||||||
|
TBL <- TBL %>% unique %>% group_by(MAIN_CERT) %>% filter(n()==1)
|
||||||
|
####
|
||||||
|
RES %>% full_join(TBL) %>% filter(toupper(CO_NAME)!=toupper(CO_NAME2))%>% select(MAIN_CERT, CO_NAME,CO_NAME2) %>% filter(!(MAIN_CERT %in% c(58279,'QSC-580','QSC-852','N-4584','43305',38008,39360,61004,62372,62850,60884,60883,16963,42365,59238,60377,60677,60885,61003,61084,61248,61293,61992,62255,62373,62469,62593,62727,54539,'N-3796',40267,34632,46096,47226,51803,52376,47225,48967,39361,41570,47224,41583,41052,52912,52960,62154))) %>% select(MAIN_CERT,CO_NAME,CO_NAME2)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
||||||
|
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
||||||
|
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
||||||
|
#
|
||||||
|
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
|
||||||
|
TBL <- rbind(TBL,UPDATE_TBL)
|
||||||
|
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
||||||
|
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
||||||
|
#
|
||||||
|
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
||||||
|
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data_2.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
|
||||||
|
|
||||||
|
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
||||||
|
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
||||||
|
#
|
||||||
|
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
||||||
|
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data_3.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
|
||||||
|
TBL <- rbind(TBL,UPDATE_TBL)
|
||||||
|
|
||||||
|
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
||||||
|
RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique %>% print(100)
|
||||||
|
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
||||||
|
#
|
||||||
|
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
||||||
|
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data_4.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
|
||||||
|
TBL <- rbind(TBL,UPDATE_TBL)
|
||||||
|
|
||||||
|
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
||||||
|
MISSING_IN_PDFS <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME)) %>% select(CO_NAME2) %>% unique
|
||||||
|
|
||||||
|
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
||||||
|
write_csv(MISSING_IN_PDFS,"Missing_From_PDS.csv")
|
||||||
|
RES %>% full_join(TBL) %>% filter(is.na(CO_NAME)) %>% select(CO_NAME2) %>% unique
|
||||||
|
RES <- RES %>% inner_join(TBL)
|
||||||
|
nrow(RES)/14854
|
||||||
|
|
||||||
|
RES <- RES %>% select(MAIN_CERT,CERT_NUM,CERT_TYPE,COUNTRY,STATE=STATE,CITY=City,CERT_TYPE,CERT_STATUS,ORIG_AUTH_DATE,AUTH_DATE,EXP_DATE,CO_NAME=CO_NAME2,PLANT_ADDRESS,SCOPE)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#' Collapse company-name variants that share a common prefix.
#'
#' Every row of `DF` whose `CO_NAME` starts with `WORD` (compared
#' case-insensitively) has its fifth column overwritten with `NEW_NAME`.
#'
#' @param WORD Prefix to look for at the start of `CO_NAME`.
#' @param NEW_NAME Replacement value; when left as `NA`, `WORD` itself is used.
#' @param DF Data frame to modify; defaults to the global `RES`.
#' @return `DF` with the matching rows updated.
COMBINE <- function(WORD, NEW_NAME = NA, DF = RES) {
  if (is.na(NEW_NAME)) {
    NEW_NAME <- WORD
  }
  # Build the row selection from DF itself (not the global RES), so a caller
  # that passes its own DF gets rows matched against that table.
  PREFIX <- toupper(substr(DF$CO_NAME, 1, nchar(WORD)))
  # which() drops NA comparisons; an NA in a logical subscript is rejected by
  # data-frame assignment.
  ROWS <- which(PREFIX == toupper(WORD))
  # NOTE(review): the target is addressed by position (column 5), while the
  # match reads CO_NAME by name -- confirm column 5 is the intended column.
  if (length(ROWS) > 0L) {
    DF[ROWS, 5] <- NEW_NAME
  }
  DF
}
|
|
||||||
RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NAC (Quality Assurance Program)',"NUA",RES$CERT_TYPE)
|
RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NAC (Quality Assurance Program)',"NUA",RES$CERT_TYPE)
|
||||||
RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NPT (Quality Assurance Program)',"NPT",RES$CERT_TYPE)
|
RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NPT (Quality Assurance Program)',"NPT",RES$CERT_TYPE)
|
||||||
RES <- RES %>% group_by(MAIN_CERT) %>% mutate(NUM_SUB_CERT=n()-1) %>% ungroup
|
RES <- RES %>% group_by(MAIN_CERT) %>% mutate(NUM_SUB_CERT=n()-1) %>% ungroup
|
||||||
RES$STATE <- NA
|
RES %>% pull(COUNTRY) %>% unique
|
||||||
|
RES %>% filter(COUNTRY=='United States') %>% pull(STATE)
|
||||||
for(x in 1:length(state.name)){
|
for(x in 1:length(state.name)){
|
||||||
#Reversing the order of state names prefers other states over "Washington"; many addresses are on "Washington St", so this maximizes the correct matches.
|
#Reversing the order of state names prefers other states over "Washington"; many addresses are on "Washington St", so this maximizes the correct matches.
|
||||||
RES$STATE[grepl(toupper(rev(state.name))[x],toupper(RES$CO_ADDR) )] <- rev(state.abb)[x]
|
RES$STATE[grepl(toupper(rev(state.name))[x],toupper(RES$CO_ADDR) )] <- rev(state.abb)[x]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user