Compare commits
No commits in common. "d721637411c94c5c87e4bbaf988facb922195fa0" and "3af79e00d176adb754c6869ae7ab4f2581bef746" have entirely different histories.
d721637411
...
3af79e00d1
50
Proc.sh
50
Proc.sh
@ -1,50 +0,0 @@
|
|||||||
ulimit -n 4096
|
|
||||||
pdfunite Data/Raw_Data/Complete_PDF_Data/*.pdf out.pdf
|
|
||||||
pdftotext "out.pdf" - > out.txt
|
|
||||||
rm out.pdf
|
|
||||||
a=$(cat out.txt)
|
|
||||||
#No "~" were seen in the data, but to be safe replace any "~" with "--" so that "~" can be used as the data delimiter.
|
|
||||||
a=$(echo -e $a |sed 's/~/--/g')
|
|
||||||
#Remove all ":" that are a column indicator
|
|
||||||
a=$(echo -e $a |sed 's/Abbreviation:/~Abbreviation~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Original Authorized Date:/~Original Authorized Date~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Authorized Date:/~Authorized Date~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Certificate Number:/~Certificate Number~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Certificate Status:/~Certificate Status~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Certificate Type:/Certificate Type~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Company Address:/~Company Address~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Company Name:/~Company Name~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Division Name:/~Division Name~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Expiration Date:/~Expiration Date~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Extension Date:/~Extension Date~/g')
|
|
||||||
a=$(echo -e $a |sed 's/Scope:/~Scope~/g')
|
|
||||||
#"Sites:" indicates that there are multiple certificates tied to the main certificate. Add an indicator that this row of data contains multiple certificates and should be processed later in the "b" variable.
|
|
||||||
a=$(echo -e $a |sed 's/Sites:/HEAD_CERT/g')
|
|
||||||
a=$(echo -e $a |sed 's/Scope Statement:/~Scope Statement~/g')
|
|
||||||
a=$(echo -e $a | sed '/~/s/^/~/' |sed 's/$/ /' | tr -s "\n" | awk '{print}' ORS='' | sed 's/ CA Connect CERTIFICATE HOLDER DETAILS //g' | sed 's/CA Connect CERTIFICATE HOLDER DETAILS //g' | sed 's/ ~/~/g'| sed 's/~ /~/g')
|
|
||||||
|
|
||||||
########################################
|
|
||||||
#Clean up these lines, using "~" as the delimiter, so the file looks tidy.
|
|
||||||
a=$(echo -e $a | sed '/~/s/^/~/' |sed 's/$/ /' | tr -s "\n" | awk '{print}' ORS='' | sed 's/^L//g' | sed 's/ CA Connect CERTIFICATE HOLDER DETAILS //g' | sed 's/CA Connect CERTIFICATE HOLDER DETAILS //g' | sed 's/Sites:/HAS_SUB_CERT:TRUE~~~~Sites:/g' |sed 's/~~~~/\n/g' | sed 's/ ~/~/g'| sed 's/~ /~/g')
|
|
||||||
#Replace the cert type of NA, with NAC
|
|
||||||
a=$(echo -e $a |sed 's/Certificate Type~NA/Certificate Type~NAC/g')
|
|
||||||
#Add a header with many "~" so that R won't truncate the data when there are fewer rows than expected. R will do most of the manipulation of the data to put it into a standard table. "Certificate Type" always begins a new entry, so start a new line before it.
|
|
||||||
a=$(echo -e $a| sed '1 s/^/~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~/' |sed 's/Certificate Type/\nCertificate Type/g' )
|
|
||||||
|
|
||||||
############################
|
|
||||||
#Some nuclear certificates have sub certificates that apply to certain things. Make B pull out these lines for separate processing
|
|
||||||
b=$(echo -e "$a" | grep HEAD_CERT | sed 's/^/HAS_SUB_CERT~TRUE~/' |sed 's/HEAD_CERT/\n/' )
|
|
||||||
b=$(echo -e "$b" | sed 's/Certification Number:/\nSUB_CERT~TRUE~Certificate Number~/g' |sed -r '/^\s*$/d'| sed 's/ ~/~/g'| sed 's/~ /~/g' )
|
|
||||||
a=$(echo -e "$a" | grep -v HEAD_CERT)
|
|
||||||
#Combine the two data types "a" being all normal certificates, "b" being all certificates that have/are sub certificates.
|
|
||||||
c=$(echo -e "${a}\n${b}")
|
|
||||||
#Remove any duplicate lines
|
|
||||||
# Keep exactly one copy of every line. The previous `sort | uniq -u` printed only
# lines that occur once, so any certificate that appeared twice was dropped entirely.
# NOTE(review): sorting also moves the all-"~" header line; its final position depends on the locale's collation.
c=$(echo -e "$c" | sort -u)
|
|
||||||
|
|
||||||
#Write to the final file for processing in R.
|
|
||||||
echo "$c" > ./Data/PROCESSED_DATA/temp.tsv
|
|
||||||
#rm ./PROCESSED_DATA/out.txt
|
|
||||||
#Rscript R-Scripts/R_Clean.r
|
|
||||||
#rm ./PROCESSED_DATA/temp.tsv
|
|
||||||
#Rscript R-Scripts/MAKE_REG_DAT.r
|
|
||||||
|
|
||||||
252
R_Clean.r
252
R_Clean.r
@ -1,252 +0,0 @@
|
|||||||
library(tidyverse)
|
|
||||||
library(zipcodeR)
|
|
||||||
# library() stops immediately if usmap is missing; require() would only return FALSE
# and let the script fail later at the first fips() call.
library(usmap)
|
|
||||||
|
|
||||||
#SAVE FOLDERS
|
|
||||||
if(!file.exists("./Data/PROCESSED_DATA/RDS")){dir.create(path="./Data/PROCESSED_DATA/RDS",recursive=TRUE)}
|
|
||||||
if(!file.exists("./Data/PROCESSED_DATA/CSV")){dir.create(path="./Data/PROCESSED_DATA/CSV",recursive=TRUE)}
|
|
||||||
|
|
||||||
temp <- read_delim("Data/PROCESSED_DATA/temp.tsv",trim_ws=TRUE,col_names=FALSE,delim="~")
|
|
||||||
problems()$expected %>% unique
|
|
||||||
temp <- temp[-1,]
|
|
||||||
|
|
||||||
ONE <- temp[,c(!rep(c(TRUE,FALSE),ncol(temp)/2))]
|
|
||||||
TWO <- temp[,c(rep(c(TRUE,FALSE),ncol(temp)/2))]
|
|
||||||
|
|
||||||
STORE <- list()
|
|
||||||
min(which(is.na(ONE[1,]))-1)
|
|
||||||
# Abbreviation rules applied, in this order, to the upper-cased field labels after
# spaces have been replaced by "_" (e.g. "Original Authorized Date" -> "ORIG_AUTH_DATE").
# The former "ORIGINAL AUTHORIZED DATE" rule is dropped: it ran after the spaces were
# already converted to "_" and therefore could never match.
LABEL_RULES <- c(
  CERTIFICATE = "CERT",
  ABBREVIATION = "ABB",
  COMPANY = "CO",
  ADDRESS = "ADDR",
  ORIGINAL = "ORIG",
  AUTHORIZED = "AUTH",
  EXPIRATION = "EXP",
  NUMBER = "NUM",
  CERTIFICATION = "CERT",
  EXTENSION = "EXT",
  DIVISION = "DIV"
)

# Turn every input row into a one-row table whose column names are the cleaned
# labels (taken from TWO) and whose values come from ONE, then append it to STORE.
# seq_len() keeps the loop from running on indices 1 and 0 when temp has no rows.
for (x in seq_len(nrow(temp))) {
  # Position of the first empty value cell, searched with the first column removed.
  # Dropping that column shifts the result down by one (giving the last populated
  # column) and also copes with records whose first value is itself NA.
  MX_COL <- min(which(is.na(ONE[x, -1])))
  DF <- ONE[x, 1:MX_COL]

  LABELS <- gsub(" ", "_", toupper(c(TWO[x, 1:MX_COL])))
  for (RULE in names(LABEL_RULES)) {
    LABELS <- gsub(RULE, LABEL_RULES[[RULE]], LABELS)
  }
  colnames(DF) <- LABELS

  # "Scope Statement" is treated as the same field as "Scope". try() keeps the
  # record unchanged if the rename fails (e.g. a SCOPE column is already present).
  if ("SCOPE_STATEMENT" %in% colnames(DF)) {
    try(DF <- DF %>% rename(SCOPE = SCOPE_STATEMENT))
  }

  STORE[[length(STORE) + 1]] <- DF
}
|
|
||||||
# Merge all one-row records into a single table with successive full joins.
# RES is reset first: the previous exists("RES") test silently joined onto whatever
# RES was left over from an earlier run in the same session.
RES <- NULL
for (REC in STORE) {
  print(REC)
  RES <- if (is.null(RES)) REC else full_join(RES, REC)
}
|
|
||||||
print("Done!")
|
|
||||||
#There is a CERT type written out as "NA", which R reads as a missing (NA) value. Change these to NAC. Sub certs are skipped because they inherit their type from the parent certificate.
|
|
||||||
# Only rows that are not sub certificates get the "NAC" default. Sub-certificate rows
# stay NA so that the grouped max(CERT_TYPE, na.rm=TRUE) further down copies the
# parent's type; previously their "NAC" could win the string comparison (e.g. "NAC" > "N")
# and overwrite the parent's real type.
RES[is.na(RES$CERT_TYPE) & is.na(RES$SUB_CERT),"CERT_TYPE"] <- "NAC"
|
|
||||||
##Fill in missing values of the sub cert and cert parent indicators.
|
|
||||||
RES$SUB_CERT <- as.logical(!is.na(RES$SUB_CERT))
|
|
||||||
RES$HAS_SUB_CERT <- as.logical(!is.na(RES$HAS_SUB_CERT))
|
|
||||||
####Create a column called MAIN_CERT that is the parent, in case a record is a sub cert.
|
|
||||||
#Filter out all records that are a SUB_CERT, find the last instance of a "-" and pull out the certificate number only up to that point.
|
|
||||||
# Strip the final "-suffix" from each sub certificate number to get its parent, row by row.
# The earlier tail(unlist(gregexpr(...))) form produced a single cut position (taken
# from the last row) and applied it to every certificate number.
RES_SUB_CERTS <- RES %>% filter(SUB_CERT) %>% mutate(MAIN_CERT=sub("-[^-]*$","",CERT_NUM))
|
|
||||||
#For all records that are not a subcert make the MAIN_CERT equal to the CERT_NUM. This data will be filtered in the next step so that only non-subcerts remain.
|
|
||||||
RES$MAIN_CERT <- RES$CERT_NUM
|
|
||||||
#Combine the data that is not a sub_cert with the data that is a sub_cert, clean the order of selection.
|
|
||||||
RES <- RES %>% filter(!SUB_CERT) %>% rbind(RES_SUB_CERTS) %>% select(MAIN_CERT,everything())
|
|
||||||
#Apply Cert values to child certifications
|
|
||||||
RES <- RES %>% group_by(MAIN_CERT) %>% mutate(CERT_TYPE=max(CERT_TYPE,na.rm=TRUE),CERT_STATUS=max(CERT_STATUS,na.rm=TRUE),CO_NAME=max(CO_NAME,na.rm=TRUE),ABB=ifelse(n()!=sum(is.na(ABB)),max(ABB,na.rm=TRUE),NA),CO_ADDR=ifelse(n()!=sum(is.na(CO_ADDR)),max(CO_ADDR,na.rm=TRUE),NA),DIV_NAME=ifelse(n()!=sum(is.na(DIV_NAME)),max(DIV_NAME,na.rm=TRUE),NA)) %>% ungroup
|
|
||||||
#Drop if there is no Address (this seems rare but a couple were found)
|
|
||||||
RES <- RES %>% filter(!is.na(CO_ADDR))
|
|
||||||
RES$AUTH_DATE <- as.Date(RES$AUTH_DATE,'%m/%d/%Y')
|
|
||||||
RES$ORIG_AUTH_DATE <- as.Date(RES$ORIG_AUTH_DATE,'%m/%d/%Y')
|
|
||||||
RES$EXP_DATE <- as.Date(RES$EXP_DATE,'%m/%d/%Y')
|
|
||||||
RES$EXT_DATE <- as.Date(RES$EXT_DATE,'%m/%d/%Y')
|
|
||||||
colnames(RES)
|
|
||||||
TBL <- rbind(read_csv("Data/Raw_Data/Table_Data/ASME_Active_Certificate_Table_Data.csv"),read_csv("Data/Raw_Data/Table_Data/ASME_Terminated_Certificate_Table_Data.csv")) %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique
|
|
||||||
|
|
||||||
UPDATE <- read_csv("Data/Raw_Data/Table_Data/Updated_Data.csv")[,-1] %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique
|
|
||||||
TBL <- TBL %>% filter(!(MAIN_CERT %in% (UPDATE %>% pull(MAIN_CERT))) )
|
|
||||||
TBL <- rbind(TBL,UPDATE)
|
|
||||||
RES_NAMES <- RES %>% select(MAIN_CERT,CO_NAME)
|
|
||||||
|
|
||||||
DROP <- TBL %>% unique %>% group_by(MAIN_CERT) %>% filter(n()>1) %>% arrange(MAIN_CERT) %>% left_join(RES_NAMES) %>% select(MAIN_CERT,CO_NAME,CO_NAME2) %>% mutate(PREFER=ifelse(toupper(CO_NAME)==toupper(CO_NAME2),1,0)) %>% filter(PREFER==0) %>% select(MAIN_CERT,CO_NAME2) %>% ungroup
|
|
||||||
TBL <- TBL %>% anti_join(DROP) %>% unique
|
|
||||||
##########Change later
|
|
||||||
# Keep only certificates that appear exactly once, then drop the grouping so the
# later rbind()/join calls work on a plain (ungrouped) table.
TBL <- TBL %>% unique %>% group_by(MAIN_CERT) %>% filter(n()==1) %>% ungroup
|
|
||||||
####
|
|
||||||
RES %>% full_join(TBL) %>% filter(toupper(CO_NAME)!=toupper(CO_NAME2))%>% select(MAIN_CERT, CO_NAME,CO_NAME2) %>% filter(!(MAIN_CERT %in% c(58279,'QSC-580','QSC-852','N-4584','43305',38008,39360,61004,62372,62850,60884,60883,16963,42365,59238,60377,60677,60885,61003,61084,61248,61293,61992,62255,62373,62469,62593,62727,54539,'N-3796',40267,34632,46096,47226,51803,52376,47225,48967,39361,41570,47224,41583,41052,52912,52960,62154))) %>% select(MAIN_CERT,CO_NAME,CO_NAME2)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
|
||||||
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
|
||||||
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
|
||||||
#
|
|
||||||
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
|
|
||||||
TBL <- rbind(TBL,UPDATE_TBL)
|
|
||||||
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
|
||||||
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
|
||||||
#
|
|
||||||
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
|
||||||
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data_2.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
# Append the new rows, as is done for the other Updated_Missing_Data files; without
# this the file was read and then discarded.
TBL <- rbind(TBL,UPDATE_TBL)
|
|
||||||
|
|
||||||
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
|
||||||
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
|
||||||
#
|
|
||||||
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
|
||||||
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data_3.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
|
|
||||||
TBL <- rbind(TBL,UPDATE_TBL)
|
|
||||||
|
|
||||||
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
|
||||||
# Show up to 100 rows. A bare print(100) passes 100 as the print width, not the row count.
RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique %>% print(n=100)
|
|
||||||
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
|
||||||
#
|
|
||||||
EXISTING_CERTS <- TBL %>% pull(MAIN_CERT)
|
|
||||||
UPDATE_TBL <- read_csv("Data/Raw_Data/Table_Data/Updated_Missing_Data_4.csv") %>% rename(CO_NAME2=`Company Name`,DIV_NAME2=`Division Name`,ABB2=Abbrev.,PLANT_ADDRESS=`Plant Address`,STATE2=`State/Province`,CERT_TYPE2=Type,CERT_STATUS2=Status,COUNTRY=`Country/Region`,MAIN_CERT=Certificate) %>% unique %>% filter(!(MAIN_CERT %in% EXISTING_CERTS))
|
|
||||||
TBL <- rbind(TBL,UPDATE_TBL)
|
|
||||||
|
|
||||||
MISSING_IN_TBL <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME2)) %>% select(CO_NAME) %>% unique
|
|
||||||
MISSING_IN_PDFS <- RES %>% full_join(TBL) %>% filter(is.na(CO_NAME)) %>% select(CO_NAME2) %>% unique
|
|
||||||
|
|
||||||
write_csv(MISSING_IN_TBL,"Missing_From_Table.csv")
|
|
||||||
write_csv(MISSING_IN_PDFS,"Missing_From_PDS.csv")
|
|
||||||
RES %>% full_join(TBL) %>% filter(is.na(CO_NAME)) %>% select(CO_NAME2) %>% unique
|
|
||||||
RES <- RES %>% inner_join(TBL)
|
|
||||||
nrow(RES)/14854
|
|
||||||
|
|
||||||
# Keep the reporting columns. The state comes from the table data, where it was renamed
# to STATE2 (there is no STATE column at this point), and CO_ADDR is retained because
# the state, ZIP and country matching below all read it.
RES <- RES %>% select(MAIN_CERT,CERT_NUM,CERT_TYPE,COUNTRY,STATE=STATE2,CITY=City,CERT_STATUS,ORIG_AUTH_DATE,AUTH_DATE,EXP_DATE,CO_NAME=CO_NAME2,CO_ADDR,PLANT_ADDRESS,SCOPE)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NAC (Quality Assurance Program)',"NUA",RES$CERT_TYPE)
|
|
||||||
RES$CERT_TYPE <- ifelse(RES$CERT_TYPE=='NPT (Quality Assurance Program)',"NPT",RES$CERT_TYPE)
|
|
||||||
RES <- RES %>% group_by(MAIN_CERT) %>% mutate(NUM_SUB_CERT=n()-1) %>% ungroup
|
|
||||||
RES %>% pull(COUNTRY) %>% unique
|
|
||||||
RES %>% filter(COUNTRY=='United States') %>% pull(STATE)
|
|
||||||
# Tag each record with the abbreviation of any state name found in its address.
# States are tried in reverse alphabetical order and later hits overwrite earlier
# ones, so "Washington" (common in street names) is overridden by almost every
# other state that also matches.
ADDR_UPPER <- toupper(RES$CO_ADDR)
STATE_NAMES_REV <- toupper(rev(state.name))
STATE_ABB_REV <- rev(state.abb)
for (x in seq_along(STATE_NAMES_REV)) {
  RES$STATE[grepl(STATE_NAMES_REV[x], ADDR_UPPER)] <- STATE_ABB_REV[x]
}
|
|
||||||
#Function to clean up bad matches, and add zip codes
|
|
||||||
# FIX_STATE: rebuild STATE for every record from the free-text CO_ADDR and add
# ZIP code, state name, the reverse_zipcode() columns and a county FIPS code to US records.
#
# RES: table of certificate records containing CO_ADDR. Existing STATE, COUNTRY and
#      CO_ZIP_CODE values are discarded.
# Returns the US records full-joined with the records for which no state was found.
# NOTE(review): rows tagged STATE "OTHER" (Puerto Rico) are in neither set and are
# therefore dropped from the result - confirm this is intended.
FIX_STATE <- function(RES){
  RES$STATE <- NA
  RES$CO_ZIP_CODE <- NA
  RES$COUNTRY <- NA

  ADDR_UPPER <- toupper(RES$CO_ADDR)

  # States are tried in reverse alphabetical order and later hits overwrite earlier
  # ones, so "Washington" (common in street names) loses to almost every other match.
  STATE_NAMES_REV <- toupper(rev(state.name))
  STATE_ABB_REV <- rev(state.abb)
  for (x in seq_along(STATE_NAMES_REV)) {
    RES$STATE[grepl(STATE_NAMES_REV[x], ADDR_UPPER)] <- STATE_ABB_REV[x]
  }

  IS_PUERTO_RICO <- grepl("PUERTO RICO", ADDR_UPPER)
  RES$COUNTRY[IS_PUERTO_RICO] <- 'USA_OTHER'
  RES$STATE[IS_PUERTO_RICO] <- 'OTHER'

  # State names that contain another state's name.
  RES$STATE[grepl("ARKANSAS", ADDR_UPPER)] <- 'AR'
  RES$STATE[grepl("WEST VIRGINIA", ADDR_UPPER)] <- 'WV'

  # Manually reviewed addresses where a street or city name matched the wrong state.
  # NA marks addresses that are outside the United States.
  STATE_OVERRIDES <- c(
    '1106 Kansas Street Memphis Tennessee USA 38106' = 'TN',
    '5215 Arkansas Road Catoosa Oklahoma USA 74015' = 'OK',
    '1428 W. 9th Street Kansas City Missouri USA 64101' = 'MO',
    '1600 Warren Street North Kansas City Missouri USA 64116' = 'MO',
    '31 Maryland Avenue Paterson New Jersey USA 07503' = 'NJ',
    '327 North Maine Fallon Nevada USA 89406' = 'NV',
    '369 West Western Avenue Port Washington Wisconsin USA 53074-0993' = 'WI',
    '600 London Rd Delaware Ohio USA 43015' = 'OH',
    '801 Georgia Ave. Deer Park Texas USA 77536' = 'TX',
    '8116 Highway 166 Arkansas City Kansas USA 67005' = 'KS',
    'Calle Siete Sur No. 108 Ciudad Industrial-Nueva Tijuana Tijuana, Baja California MEX 22500' = NA,
    'Carrera 22 No. 3A-37 La Virginia, Risaralda COL' = NA,
    'Km 20 Via Cali - Florida Corregimiento La Regina Candelaria Valle del Cauca COL' = NA,
    '1133 California Way Longview Washington USA 98632' = 'WA',
    '13800 Wyandotte St. Kansas City Missouri USA 64145-1518' = 'MO',
    '2307 Oregon Street Oshkosh Wisconsin USA 54902' = 'WI',
    '2538 W. Kentucky Ave. Pampa Texas USA 79065' = 'TX',
    '2611 Southwest Blvd. Kansas City Missouri USA 64108' = 'MO',
    '2905 Maryland Avenue North Versailles Pennsylvania USA 15137' = 'PA',
    '307 Mississippi Ave Wichita Falls Texas USA 76301' = 'TX',
    '35006 Washington Avenue Honey Creek Wisconsin USA 53138' = 'WI',
    '3737 Old Iowa Park Road Wichita Falls Texas USA 76306' = 'TX',
    '3928 Bacon Switch Rd. Iowa Park Texas USA 76367' = 'TX',
    '4001 East 149th Street, Suite B Kansas City Missouri USA 64147' = 'MO',
    '455 Michigan Drive Oakville Ontario CAN L6L 0G4' = NA,
    '5873 FM 369 North Iowa Park Texas USA 76367' = 'TX',
    'Calle Tijuana No. 16 Col. Baja California El Salto, Jalisco MEX 45692' = NA,
    'Champ de Mass 5 Ave Anatole Paris New York FRA 75007' = NA,
    'Idaho National Laboratory 2525 Fremont Ave Idaho Falls Idaho USA 83145' = 'ID',
    'Rua Chaves, 510 Jardim California Barueri, Sao Paulo BRA 06409-000' = NA,
    'VIA POLENGHI 5 MONTANASO LOMBARDO ITA 26836' = NA
  )
  for (ADDR in names(STATE_OVERRIDES)) {
    # which() ignores rows whose address is NA instead of failing on an NA index.
    RES$STATE[which(RES$CO_ADDR == ADDR)] <- STATE_OVERRIDES[[ADDR]]
  }

  RES_NON_US <- RES %>% filter(is.na(STATE)) %>% select(-CO_ZIP_CODE)
  RES <- RES %>% filter(!is.na(STATE), STATE != "OTHER")

  # The ZIP code is the first five characters of the last word of the address.
  RES$CO_ZIP_CODE <- gsub("-", "", substr(word(RES$CO_ADDR, -1), 1, 5))
  #One zip failing to match, used USPS lookup of address to change to the proper ZIP
  RES$CO_ZIP_CODE[which(RES$CO_ZIP_CODE == '83145')] <- '83402'

  RES <- RES %>% left_join(tibble(STATE = state.abb, STATE_NAME = state.name))
  ZIPS <- reverse_zipcode(RES$CO_ZIP_CODE) %>% rename(STATE = state, CO_ZIP_CODE = zipcode)
  RES <- RES %>% left_join(ZIPS)

  # Look up FIPS codes one row at a time; rows where fips() fails keep NA.
  RES$FIPS <- NA
  for (N in seq_len(nrow(RES))) {
    try(RES$FIPS[N] <- fips(RES$STATE[N], RES$county[N]))
  }
  #Fix missing codes. It looks like they use special characters
  # NOTE(review): "Doa Ana County" looks like "Doña Ana County" with the ñ lost; confirm
  # the literal matches what reverse_zipcode() returns.
  RES$FIPS[which(RES$STATE == 'NM' & RES$county == 'Doa Ana County')] <- '35013'
  # RES[RES$STATE=='LA' &RES$county=='St Mary Parish',"FIPS"] <- '22101'

  RES <- RES %>% full_join(RES_NON_US)
  return(RES)
}
|
|
||||||
RES <- FIX_STATE(RES)
|
|
||||||
RES$COUNTRY <- NA
|
|
||||||
#A function to identify whether a country abbreviation is in the address. If "WRITE" is TRUE the function updates the "COUNTRY" field to be the matched COUNTRY.
|
|
||||||
# FIX_COUNTRY: find records whose address mentions a country abbreviation.
#
# COUNTRY_ABBR: abbreviation to look for (upper-cased before matching).
# WRITE: FALSE returns the unique matching CO_ADDR values for inspection;
#        TRUE returns DF with COUNTRY set to COUNTRY_ABBR on the matching rows.
# DF: table with CO_ADDR, STATE and COUNTRY columns (defaults to the global RES).
#
# Only rows that have neither a STATE nor a COUNTRY yet are considered.
FIX_COUNTRY <- function( COUNTRY_ABBR,WRITE=FALSE,DF=RES){
  # Abbreviation surrounded by spaces somewhere inside the address.
  IN_MIDDLE <- grepl(toupper(paste0(" ", COUNTRY_ABBR, " ")), DF$CO_ADDR)
  # Last word of the address ends with the abbreviation.
  AT_END <- grepl(toupper(paste0(COUNTRY_ABBR, "$")), word(DF$CO_ADDR, -1))

  # Skip records that were already assigned a state or a country.
  HIT <- (IN_MIDDLE | AT_END) & is.na(DF$STATE) & is.na(DF$COUNTRY)

  if (WRITE) {
    DF[HIT, "COUNTRY"] <- COUNTRY_ABBR
    return(DF)
  }
  DF[HIT, "CO_ADDR"] %>% unique
}
|
|
||||||
|
|
||||||
RES$COUNTRY <- ifelse(!is.na(RES$STATE),"USA",NA)
|
|
||||||
|
|
||||||
|
|
||||||
COUNTRY_ABBR <- c("CAN","EGY","KOR","JPN","DEU","CHN","IND","ZAF","ITA","ARG","FRA","ESP","TWN","AUT","SCO","BRA","SWE","TUR","SAU","SGP","PAK","ARE","PHL","BHR","OMN","CHE","ISR","LBN","AUS","MEX","GRC","THA","HUN","BEL","ROU","PJ","QAT","VNM","IRQ","MYS","IRL","POL","SVN","NLD","FIN","IDN","CZE","VEN","COL","CHL","NGA","ECU","GBR","DNK","TTO","UKR","SVK","BGR","KWT","KAZ","EST","GTM","RUS","BGD","UZB","PRT","NZL","TUN","JOR","URY","AZE","PER","NOR","BOL","SUR","HRV","CRI","BRN","LIE","HKG","DOM","PE","PK","SR","WAL","DB","LL")
|
|
||||||
# Apply FIX_COUNTRY for every abbreviation in list order, feeding each result into the
# next call so records already given a country are not matched again.
RES <- Reduce(
  function(ACC, ABBR) FIX_COUNTRY(ABBR, TRUE, DF = ACC),
  COUNTRY_ABBR,
  RES
)
|
|
||||||
#Correct remaining issues
|
|
||||||
#PE means Peru, which also has PER
|
|
||||||
RES$COUNTRY <- ifelse(RES$COUNTRY=='PE','PER',RES$COUNTRY)
|
|
||||||
RES$COUNTRY <- ifelse(RES$COUNTRY=='PK','PAK',RES$COUNTRY)
|
|
||||||
RES$COUNTRY <- ifelse(RES$COUNTRY=='SR','SUR',RES$COUNTRY)
|
|
||||||
#Wales
|
|
||||||
RES$COUNTRY <- ifelse(RES$COUNTRY=='WAL','GBR',RES$COUNTRY)
|
|
||||||
#All these address in Netherlands
|
|
||||||
RES$COUNTRY <- ifelse(RES$COUNTRY=='DB','NLD',RES$COUNTRY)
|
|
||||||
RES$COUNTRY <- ifelse(RES$COUNTRY=='LL','NLD',RES$COUNTRY)
|
|
||||||
#In Slovakia
|
|
||||||
RES$COUNTRY <- ifelse(RES$CO_ADDR=='Jurská cesta 7 Levice 93401','SVK',RES$COUNTRY)
|
|
||||||
#In Northern Ireland
|
|
||||||
RES$COUNTRY <- ifelse(RES$CO_ADDR=='Lissue Industrial Estate Moira Road Lisburn NOT BT28 2RF','IRL',RES$COUNTRY)
|
|
||||||
#Remove "null" addresses
|
|
||||||
RES <- RES %>% filter(CO_ADDR!="null")
|
|
||||||
|
|
||||||
###MANUAL REVIEW OF MISSED ENTRIES
|
|
||||||
#RES %>% filter(is.na(COUNTRY)) %>% select(MAIN_CERT,CO_ADDR) %>% print(n=100)
|
|
||||||
#Add Year columns
|
|
||||||
|
|
||||||
# Year and month of the original and current authorization dates.
# The *_MONTH columns previously used year() and so duplicated the *_YEAR columns.
RES <- RES %>% mutate(ORIG_AUTH_YEAR=year(ORIG_AUTH_DATE),AUTH_YEAR=year(AUTH_DATE),ORIG_AUTH_MONTH=month(ORIG_AUTH_DATE),AUTH_MONTH=month(AUTH_DATE))
|
|
||||||
#Add Nuclear indicator (all license with a nuclear related certificate)
|
|
||||||
RES <- RES %>% mutate(NUCLEAR=CERT_TYPE %in%c("N","N3","NAC","NPT","NS","NUA","NV","OWN","G","GC","MO"))
|
|
||||||
RES
|
|
||||||
|
|
||||||
#RES <- RES %>% filter(CERT_TYPE!="OWN",CERT_TYPE!="NAC")
|
|
||||||
##SAVE
|
|
||||||
saveRDS(RES,"./Data/PROCESSED_DATA/RDS/ASME.Rds")
|
|
||||||
write_csv(RES,"./Data/PROCESSED_DATA/CSV/ASME.csv",quote="all")
|
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user