R-uniquevalues-scriptΒΆ
This R script was created to visualize the unique values and duplicated rows in the database:
pdf("rplot18.pdf")
library(readr) # for read_delim()
library(dplyr) # for distinct()
library(ggplot2)
library(RColorBrewer)
library(doMC)
registerDoMC(cores = 2)
iref15_path = "/dataext/irdata18"
mitab_file_name = "mitab_all"
#iref15_path = "/data/irdata15A/" mitab_file_name = "All.mitab.12-11-2017.txt"
setwd(iref15_path)
#####################################################################
# Load the MITAB data
iref <- read_delim( file.path(iref15_path, mitab_file_name), "\t",
escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
mitab_header =
c("ID_Interactor_A",
"ID_Interactor_B",
"Alt_IDs_Interactor_A",
"Alt_IDs_Interactor_B",
"Aliases_Interactor_A",
"Aliases_Interactor_B",
"Interaction_Detection_Method",
"Publication_1st_Author",
"Publication_Identifiers",
"Taxid_Interactor_A",
"Taxid_Interactor_B",
"Interaction_Types",
"Source_Database",
"Interaction_Identifiers",
"Confidence_Values",
"Expansion",
"Biological_role_A",
"Biological_role_B",
"Experimental_role_A",
"Experimental_role_B",
"Interactor_type_A",
"Interactor_type_B",
"Xref_for_interactor_A",
"Xref_for_interactor_B",
"Xref_for_the_interaction",
"Annotations_for_interactor_A",
"Annotations_for_interactor_B",
"Annotations_for_the_interaction",
"NCBI_Taxonomy_identifier_for_the_host_organism",
"Parameters_of_the_interaction",
"Creation_date",
"Update_date",
"Checksum_for_interactor_A",
"Checksum_for_interactor_B",
"Checksum_for_interaction",
"negative",
"OriginalReferenceA",
"OriginalReferenceB",
"FinalReferenceA",
"FinalReferenceB")
colnames(iref) = mitab_header
nrow(iref)
## finding duplicate rows
mitab=as_tibble(iref[,1:14])
duplicated_rows=duplicated(mitab)
duplicated_iref=mitab[duplicated_rows,]
table_count <- table(duplicated_iref[13])
barplot_data<- data.frame(name=names(table_count),value=table_count)
ggplot(barplot_data,aes(x=name,y=value.Freq)) +
geom_bar(stat="identity") +
scale_fill_brewer(palette = "Set1") +
theme(legend.position="none")+
coord_flip()+
ggtitle("amount of duplicated")
#!!too intensive
##all unique values of columns
unique_columns=sapply(mitab,n_distinct)
barplot_data<- data.frame(name=mitab_header[1:14],value=unique_columns)
ggplot(barplot_data,aes(x=name,y=value)) +
geom_bar(stat="identity") +
scale_fill_brewer(palette = "Set1") +
theme(legend.position="none")+
coord_flip()+
ggtitle("amount of unique values in all columns")
## make list of columns with a low amount of unique values
table_count <- table(mitab[12])
barplot_data<- data.frame(name=names(table_count),value=table_count)
ggplot(barplot_data,aes(x=name,y=value.Freq)) +
geom_bar(stat="identity") +
scale_fill_brewer(palette = "Set1") +
theme(legend.position="none")+
coord_flip()+
ggtitle(mitab_header[12])
table_count <- table(mitab[13])
barplot_data<- data.frame(name=names(table_count),value=table_count)
ggplot(barplot_data,aes(x=name,y=value.Freq)) +
geom_bar(stat="identity") +
scale_fill_brewer(palette = "Set1") +
theme(legend.position="none")+
coord_flip()+
ggtitle(mitab_header[13])
dev.off()