diff --git a/assets/dep_lncselect.R b/assets/dep_lncselect.R new file mode 100644 index 0000000..d33e919 --- /dev/null +++ b/assets/dep_lncselect.R @@ -0,0 +1,298 @@ + #!/usr/bin/env Rscript + + # Carregar bibliotecas necessárias + library(tidyverse) + library(rtracklayer) + + # Obter variáveis de ambiente + annotated_gtf <- Sys.getenv("ANNOTATED_GTF") + cpc2_results <- Sys.getenv("CPC2_RESULTS") + plek_results <- Sys.getenv("PLEK_RESULTS") + pfam_domtblout <- Sys.getenv("PFAM_DOMTBLOUT") + reference_gtf <- Sys.getenv("REFERENCE_GTF") + prefix <- Sys.getenv("PREFIX") + min_length <- as.numeric(Sys.getenv("MIN_LENGTH")) + pfam_evalue <- as.numeric(Sys.getenv("PFAM_EVALUE")) + + # Configurar log + log_file <- paste0(prefix, ".log") + sink(log_file, append = TRUE, split = TRUE) + + cat("=== SELEÇÃO DE lncRNAs ===\n") + cat("Iniciando seleção de lncRNAs e combinação com anotação de referência\n") + cat("Data/hora de início:", as.character(Sys.time()), "\n") + cat("\n") + + # Ler arquivos de entrada + cat("Lendo arquivos de entrada...\n") + cat(" GTF anotado:", annotated_gtf, "\n") + cat(" Resultados CPC2:", cpc2_results, "\n") + cat(" Resultados PLEK:", plek_results, "\n") + cat(" Domínios Pfam:", pfam_domtblout, "\n") + cat(" GTF de referência:", reference_gtf, "\n") + cat("\n") + + # Carregar GTF anotado + dup_reference_annot <- as.data.frame(rtracklayer::import(annotated_gtf)) + cat("GTF anotado carregado:", nrow(dup_reference_annot), "linhas\n") + + # Carregar resultados CPC2 + CPC2 <- read.delim(cpc2_results) + colnames(CPC2) <- c("transcript_id", "transcript_length", "peptide_length", "Fickett_score", + "pI", "ORF_integrity", "coding_probability", "CPC2") + cat("Resultados CPC2 carregados:", nrow(CPC2), "transcritos\n") + cat(" Codificantes:", sum(CPC2$CPC2 == "coding"), "\n") + cat(" Não-codificantes:", sum(CPC2$CPC2 == "noncoding"), "\n") + + # Carregar resultados PLEK + PLEK <- read.delim(plek_results, header = FALSE) + PLEK <- PLEK %>% mutate(transcript_id = str_extract(V3, "(?<=\\>).*?(?= loc:)")) + colnames(PLEK) <- c("PLEK", "value", "features", "transcript_id") + cat("Resultados PLEK carregados:", nrow(PLEK), "transcritos\n") + cat(" Codificantes:", sum(PLEK$PLEK == "Coding"), "\n") + cat(" Não-codificantes:", sum(PLEK$PLEK == "Non-coding"), "\n") + + # Carregar domínios Pfam + pfam <- readr::read_table(pfam_domtblout, comment = "#", col_names = FALSE) + cat("Domínios Pfam carregados:", nrow(pfam), "hits\n") + + # Identificar transcritos com domínios significativos + transcripts_with_domain <- as.data.frame(pfam) %>% + dplyr::filter(X7 <= pfam_evalue) %>% + mutate(transcript_id = str_remove(X1, "\\.p[0-9]+$")) %>% + distinct(transcript_id) %>% + mutate(domain_present = "yes") + + cat("Transcritos com domínios Pfam significativos (E-value <=", pfam_evalue, "):", nrow(transcripts_with_domain), "\n") + + # Combinar todos os dados + cat("\nCombinando dados...\n") + gtf_complete <- merge(dup_reference_annot, transcripts_with_domain, by = "transcript_id", all.x = TRUE) + gtf_complete <- merge(gtf_complete, PLEK[,c("transcript_id", "PLEK")], by = "transcript_id", all.x = TRUE) + gtf_complete <- merge(gtf_complete, CPC2[,c("transcript_id", "transcript_length", "peptide_length", "CPC2")], by = "transcript_id", all.x = TRUE) + + cat("GTF completo criado:", nrow(gtf_complete), "linhas\n") + + # Aplicar filtros para seleção de lncRNAs + cat("\nAplicando filtros para seleção de lncRNAs...\n") + cat("Critérios:\n") + cat(" - class_code em: x, i, u\n") + cat(" - Comprimento >= min_length nt\n") + cat(" - PLEK == 'Non-coding' E CPC2 == 'noncoding'\n") + cat(" - Sem domínios Pfam significativos\n") + cat("\n") + + # Contar transcritos em cada etapa do filtro + step1 <- gtf_complete %>% dplyr::filter(class_code %in% c("x", "i", "u")) + cat("Após filtro class_code:", length(unique(step1$transcript_id)), "transcritos únicos\n") + + step2 <- step1 %>% dplyr::filter(transcript_length >= min_length) + cat("Após filtro comprimento:", length(unique(step2$transcript_id)), "transcritos únicos\n") + + step3 <- step2 %>% dplyr::filter(PLEK == "Non-coding" & CPC2 == "noncoding") + cat("Após filtros CPC2 e PLEK:", length(unique(step3$transcript_id)), "transcritos únicos\n") + + step4 <- step3 %>% dplyr::filter(is.na(domain_present)) + cat("Após filtro domínios Pfam:", length(unique(step4$transcript_id)), "transcritos únicos\n") + + # Selecionar lncRNAs finais + lncRNA <- step4 %>% pull(transcript_id) %>% unique() + cat("\nlncRNAs selecionados:", length(lncRNA), "transcritos\n") + + # Criar GTF de lncRNAs + gtf_lncRNA <- gtf_complete %>% + filter(transcript_id %in% lncRNA) %>% + mutate(transcript_biotype = ifelse(type == "transcript", "lncRNA", "")) + + cat("GTF lncRNA criado:", nrow(gtf_lncRNA), "linhas\n") + + # Verificar genes com diferentes class_codes + cat("\nVerificando genes com diferentes class_codes...\n") + lncRNAs_check <- gtf_lncRNA %>% + dplyr::filter(type == "transcript") %>% + dplyr::select(class_code, gene_id) %>% + distinct() + + duplicated_genes <- nrow(lncRNAs_check) - length(unique(gtf_lncRNA$gene_id)) + cat("Genes com diferentes class_codes:", duplicated_genes, "\n") + + # Exportar GTFs intermediários + cat("\nExportando GTFs...\n") + rtracklayer::export(gtf_lncRNA, "gtf_lncRNA.gtf") + rtracklayer::export(gtf_complete, "gtf_complete.gtf") + cat(" gtf_lncRNA.gtf exportado\n") + cat(" gtf_complete.gtf exportado\n") + + # Combinar com anotação de referência + cat("\nCombinando com anotação de referência...\n") + reference <- as.data.frame(rtracklayer::import(reference_gtf)) + reference_filter <- reference %>% filter(type %in% c("transcript", "exon")) + cat("Referência carregada:", nrow(reference_filter), "linhas (transcript + exon)\n") + + combined_stringtie_ncbi <- dplyr::bind_rows(reference_filter, gtf_lncRNA) + + combined_stringtie_ncbi$Dbxref <- sapply(combined_stringtie_ncbi$Dbxref, function(x) { + if (is.null(x)) return(NA_character_) + paste(x, collapse = ",") + }) + + combined_stringtie_ncbi$end_range <- sapply(combined_stringtie_ncbi$end_range, function(x) { + if (is.null(x)) return(NA_character_) + paste(x, collapse = ",") + }) + + combined_stringtie_ncbi$start_range <- sapply(combined_stringtie_ncbi$start_range, function(x) { + if (is.null(x)) return(NA_character_) + paste(x, collapse = ",") + }) + + combined_stringtie_ncbi$Parent <- sapply(combined_stringtie_ncbi$Parent, function(x) { + if (is.null(x)) return(NA_character_) + paste(x, collapse = ",") + }) + + combined_stringtie_ncbi$Note <- sapply(combined_stringtie_ncbi$Note, function(x) { + if (is.null(x)) return(NA_character_) + paste(x, collapse = ",") + }) + + combined_stringtie_ncbi$gene_id[is.na(combined_stringtie_ncbi$gene_id)] <- "unknown_gene" + combined_stringtie_ncbi$transcript_id[is.na(combined_stringtie_ncbi$transcript_id)] <- "unknown_transcript" + combined_stringtie_ncbi$exon_number[is.na(combined_stringtie_ncbi$exon_number)] <- "0" + + # Criar a coluna `attribute` no formato GTF + combined_stringtie_ncbi$attribute <- paste0( + 'gene_id "', combined_stringtie_ncbi$gene_id, '"; ', + 'transcript_id "', combined_stringtie_ncbi$transcript_id, '"; ', + 'exon_number "', combined_stringtie_ncbi$exon_number, '";' + ) + + # Criar o GRanges com os campos obrigatórios + combined_stringtie_ncbi <- GRanges( + seqnames = combined_stringtie_ncbi$seqnames, + ranges = IRanges(start = combined_stringtie_ncbi$start, end = combined_stringtie_ncbi$end), + strand = combined_stringtie_ncbi$strand, + + # metadados (evite nomes reservados) + source = combined_stringtie_ncbi$source, + type = combined_stringtie_ncbi$type, + class_code = combined_stringtie_ncbi$class_code, + transcript_id = combined_stringtie_ncbi$transcript_id, + gene_id = combined_stringtie_ncbi$gene_id, + db_xref = sapply(combined_stringtie_ncbi$Dbxref, paste, collapse = ","), + gbkey = combined_stringtie_ncbi$gbkey, + locus_tag = combined_stringtie_ncbi$locus_tag, + partial = combined_stringtie_ncbi$partial, + orig_protein_id = combined_stringtie_ncbi$orig_protein_id, + orig_transcript_id = combined_stringtie_ncbi$orig_transcript_id, + product = combined_stringtie_ncbi$product, + transcript_biotype = combined_stringtie_ncbi$transcript_biotype, + exon_number = combined_stringtie_ncbi$exon_number, + pseudo = combined_stringtie_ncbi$pseudo, + PLEK = combined_stringtie_ncbi$PLEK, + transcript_length = combined_stringtie_ncbi$transcript_length, + peptide_length = combined_stringtie_ncbi$peptide_length, + CPC2 = combined_stringtie_ncbi$CPC2, + xloc = combined_stringtie_ncbi$xloc, + tss_id = combined_stringtie_ncbi$tss_id, + cmp_ref = combined_stringtie_ncbi$cmp_ref, + transcript_id_stringtie = NA_character_, + gene_id_stringtie = NA_character_, + score = combined_stringtie_ncbi$score, + phase = combined_stringtie_ncbi$phase, + attribute = combined_stringtie_ncbi$attribute + ) + + cat("GTF combinado criado:", nrow(combined_stringtie_ncbi), "linhas\n") + + # Verificar sobreposição + overlap_check <- sum(gtf_lncRNA$transcript_id %in% reference$transcript_id) + cat("Sobreposição transcript_id (esperado 0):", overlap_check, "\n") + + rtracklayer::export(combined_stringtie_ncbi, "combined_stringtie_ncbi.gtf") + cat(" combined_stringtie_ncbi.gtf exportado\n") + + # Renomear IDs + cat("\nRenomeando IDs...\n") + combined_stringtie_ncbi_new_names <- as.data.frame(rtracklayer::import("combined_stringtie_ncbi.gtf")) + + new_names <- combined_stringtie_ncbi_new_names %>% + dplyr::filter(type == "transcript") %>% + mutate(transcript_id_new = ifelse(source == "StringTie", + paste0("Pb18_lncRNA", class_code, gsub("MSTRG.", "_", transcript_id)), + transcript_id)) %>% + mutate(gene_id_new = ifelse(source == "StringTie", + paste0("Pb18_lncRNA", gsub("MSTRG.", "_", gene_id)), + gene_id)) %>% + dplyr::select(transcript_id, transcript_id_new, gene_id, gene_id_new) %>% distinct() + + cat("Novos nomes criados para:", nrow(new_names), "transcritos\n") + + # Aplicar novos nomes + combined_stringtie_ncbi_new_names <- left_join(combined_stringtie_ncbi_new_names, + new_names %>% dplyr::select(transcript_id, transcript_id_new) %>% distinct(), + by = "transcript_id") + + combined_stringtie_ncbi_new_names <- left_join(combined_stringtie_ncbi_new_names, + new_names %>% dplyr::select(gene_id, gene_id_new) %>% distinct(), + by = "gene_id") + + # Reorganizar colunas + combined_stringtie_ncbi_new_names <- combined_stringtie_ncbi_new_names %>% + dplyr::rename(transcript_id_stringtie = transcript_id) %>% + dplyr::rename(gene_id_stringtie = gene_id) %>% + dplyr::rename(transcript_id = transcript_id_new) %>% + dplyr::rename(gene_id = gene_id_new) %>% + dplyr::select(seqnames, start, end, width, strand, + source, type, score, phase, gene_id, + transcript_id, db_xref, gbkey, locus_tag, partial, + orig_protein_id, orig_transcript_id, product, transcript_biotype, exon_number, + pseudo, PLEK, transcript_length, peptide_length, CPC2, + xloc, class_code, tss_id, cmp_ref, + transcript_id_stringtie, gene_id_stringtie) + + + rtracklayer::export(combined_stringtie_ncbi_new_names, "combined_stringtie_ncbi_new_names.gtf") + cat(" combined_stringtie_ncbi_new_names.gtf exportado\n") + + # Estatísticas finais + cat("\n=== ESTATÍSTICAS FINAIS ===\n") + cat("Total de lncRNAs identificados:", length(lncRNA), "\n") + + # Estatísticas por class_code + class_stats <- gtf_lncRNA %>% + filter(type == "transcript") %>% + count(class_code) %>% + arrange(desc(n)) + + cat("Distribuição por class_code:\n") + for(i in 1:nrow(class_stats)) { + cat(" ", class_stats$class_code[i], ":", class_stats$n[i], "\n") + } + + # Estatísticas de comprimento + length_stats <- gtf_lncRNA %>% + filter(type == "transcript") %>% + summarise( + mean_length = mean(transcript_length, na.rm = TRUE), + median_length = median(transcript_length, na.rm = TRUE), + min_length = min(transcript_length, na.rm = TRUE), + max_length = max(transcript_length, na.rm = TRUE) + ) + + cat("Estatísticas de comprimento:\n") + cat(" Média:", round(length_stats$mean_length, 2), "nt\n") + cat(" Mediana:", length_stats$median_length, "nt\n") + cat(" Mínimo:", length_stats$min_length, "nt\n") + cat(" Máximo:", length_stats$max_length, "nt\n") + + cat("\nProcessamento concluído com sucesso!\n") + cat("Data/hora de término:", as.character(Sys.time()), "\n") + sink() + + cat('"', Sys.getenv("TASK_PROCESS", "SELECT_LNCRNAS"), '":\n', file="versions.yml", sep="") + cat(' r-base: "', R.version.string, '"\n', file="versions.yml", append=TRUE, sep="") + cat(' tidyverse: "', as.character(packageVersion("tidyverse")), '"\n', file = "versions.yml", append = TRUE, sep="") + cat(' rtracklayer: "', as.character(packageVersion("rtracklayer")), '"\n', file = "versions.yml", append = TRUE, sep="") + + sink() \ No newline at end of file diff --git a/assets/lncselect.R b/assets/lncselect.R new file mode 100644 index 0000000..2e787cd --- /dev/null +++ b/assets/lncselect.R @@ -0,0 +1,174 @@ +################################################################################ +###################### Select lncRNAs (back to shell) ########################## +################################################################################ +library(tidyverse) +library(dplyr) +library(stringr) + + annotated_gtf <- Sys.getenv("ANNOTATED_GTF") + cpc2_results <- Sys.getenv("CPC2_RESULTS") + plek_results <- Sys.getenv("PLEK_RESULTS") + pfam_domtblout <- Sys.getenv("PFAM_DOMTBLOUT") + reference_gtf <- Sys.getenv("REFERENCE_GTF") + prefix <- Sys.getenv("PREFIX") + min_length <- as.numeric(Sys.getenv("MIN_LENGTH")) + pfam_evalue <- as.numeric(Sys.getenv("PFAM_EVALUE")) + +dup_reference_annot <- as.data.frame(rtracklayer::import(annotated_gtf)) + +# dup_reference_annot <- dup_reference_annot %>% +# mutate( +# gene_id = ifelse( +# str_detect(gene_id, "gene-PADG_"), +# str_extract(gene_id, "gene-PADG_[0-9]+"), +# gene_id # mantém MSTRG.xxx se não tiver gene-PADG +# ) +# ) + +CPC2 <- read.delim(cpc2_results) +colnames(CPC2) <- c("transcript_id", "transcript_length", "peptide_length", "Fickett_score", + "pI", "ORF_integrity", "coding_probability", "CPC2") + +PLEK <- read.delim(plek_results, header = F) +PLEK <- PLEK %>% mutate(transcript_id = str_extract(V3, "(?<=\\>).*?(?= loc:)")) +colnames(PLEK) <- c("PLEK", "value", "features", "transcript_id") + +pfam <- readr::read_table(pfam_domtblout, comment = "#", col_names = FALSE) + + + +# colnames(pfam) <- c( +# "target_name", "target_accession", "tlen", +# "query_name", "query_accession", "qlen", +# "full_seq_Evalue", "full_seq_score", "full_seq_bias", +# "domain_num", "domain_total", +# "domain_c_Evalue", "domain_i_Evalue", "domain_score", "domain_bias", +# "hmm_from", "hmm_to", "ali_from", "ali_to", "env_from", "env_to", +# "acc", "description" +# ) + + +transcripts_with_domain <- as.data.frame(pfam) %>% + dplyr::filter(X7 <= pfam_evalue) %>% + mutate(transcript_id = str_remove(X1, "\\.p[0-9]+$")) %>% # Remove .p1, .p2 etc + distinct(transcript_id) %>% + mutate(domain_present = "yes") + +gtf_complete <- merge(dup_reference_annot, transcripts_with_domain, by = "transcript_id", all.x = T) +gtf_complete <- merge(gtf_complete, PLEK[,c("transcript_id", "PLEK")], by = "transcript_id", all.x = T) +gtf_complete <- merge(gtf_complete, CPC2[,c("transcript_id", "transcript_length", "peptide_length", "CPC2")], by = "transcript_id", all.x = T) + +rm(pfam, CPC2, PLEK, transcripts_with_domain) + +lncRNA <- gtf_complete %>% + dplyr::filter(class_code %in% c("x", "i", "u")) %>% + dplyr::filter(transcript_length >= min_length) %>% + dplyr::filter(CPC2 == "noncoding" & PLEK == "Non-coding") %>% + dplyr::filter(is.na(domain_present)) %>% pull(transcript_id) %>% unique() + +gtf_lncRNA <- gtf_complete %>% filter(transcript_id %in% lncRNA) %>% + mutate(transcript_biotype = ifelse(type == "transcript", "lncRNA", "")) + + +rtracklayer::export(gtf_lncRNA, "gtf_lncRNA.gtf", format = gtf) +rtracklayer::export(gtf_complete, "gtf_complete.gtf", format = gtf) + +rm(dup_reference_annot, gtf_complete, lncRNA) + +################################################################################ +#################### Combine reference gtf and stringtie ####################### +################################################################################ + +## Check gene equal with different class +lncRNAs_check <- gtf_lncRNA %>% + dplyr::filter(type == "transcript") %>% + dplyr::select(class_code, gene_id) %>% + distinct() + +## 15 genes with different class +nrow(lncRNAs_check) - length(unique(gtf_lncRNA$gene_id)) + +## Select genes +cut_lncRNA <- lncRNAs_check %>% dplyr::filter(duplicated(lncRNAs_check$gene_id)) %>% pull(gene_id) + +#gtf_lncRNA %>% dplyr::filter(gene_id %in% cut_lncRNA) %>% +# dplyr::filter(type == "transcript") %>% +# dplyr::select(start, end, strand, gene_id, transcript_id, class_code, xloc, transcript_length) + + + +rm(lncRNAs_check, cut_lncRNA) + +## Filter transcript and exon from reference +reference <- as.data.frame(rtracklayer::import(reference_gtf)) +reference_filter <- reference %>% filter(type %in% c("transcript", "exon")) + +#gtf_lncRNA <- GenomicRanges::makeGRangesFromDataFrame(gtf_lncRNA, keep.extra.columns = TRUE) + +# reference_filter <- makeGRangesFromDataFrame( +# reference_filter, +# seqnames.field = "seqnames", +# start.field = "start", +# end.field = "end", +# strand.field = "strand", +# keep.extra.columns = TRUE +# ) + +combined_stringtie_ncbi <- dplyr::bind_rows(reference_filter, gtf_lncRNA) +# combined_stringtie_ncbi <- c(reference_filter, gtf_lncRNA) + + + +sum(gtf_lncRNA$transcript_id %in% reference$transcript_id) #0 is expected + +# Exportar como novo GTF +rtracklayer::export(combined_stringtie_ncbi, "combined_stringtie_ncbi.gtf", format = gtf) + +#rm(combined_stringtie_ncbi, gtf_lncRNA, reference, reference_filter) + +################################################################################ +################################# Change name ################################## +################################################################################ + +combined_stringtie_ncbi_new_names <- as.data.frame(combined_stringtie_ncbi) + +new_names <- combined_stringtie_ncbi_new_names %>% + dplyr::filter(type == "transcript") %>% + mutate(transcript_id_new = ifelse(source == "StringTie", + paste0("Pb18_lncRNA", class_code, gsub("MSTRG.", "_", transcript_id)), + transcript_id)) %>% + mutate(gene_id_new = ifelse(source == "StringTie", + paste0("Pb18_lncRNA", gsub("MSTRG.", "_", gene_id)), + gene_id)) %>% + dplyr::select(transcript_id, transcript_id_new, gene_id, gene_id_new) %>% distinct() + + +combined_stringtie_ncbi_new_names <- left_join(combined_stringtie_ncbi_new_names, + new_names %>% dplyr::select(transcript_id, transcript_id_new) %>% distinct(), by = "transcript_id") + +combined_stringtie_ncbi_new_names <- left_join(combined_stringtie_ncbi_new_names, + new_names %>% dplyr::select(gene_id, gene_id_new) %>% distinct(), by = "gene_id") + +combined_stringtie_ncbi_new_names <- combined_stringtie_ncbi_new_names %>% + dplyr::rename(transcript_id_stringtie = transcript_id) %>% + dplyr::rename(gene_id_stringtie = gene_id) %>% + dplyr::rename(transcript_id = transcript_id_new) %>% + dplyr::rename(gene_id = gene_id_new) %>% + dplyr::select(seqnames,start, end,width,strand, + source, type,score,phase,gene_id, + transcript_id,db_xref,gbkey,locus_tag,partial, + orig_protein_id,orig_transcript_id,product, transcript_biotype,exon_number, + pseudo,PLEK,transcript_length,peptide_length,CPC2, + xloc,class_code,tss_id,gene_name,cmp_ref, + transcript_id_stringtie,gene_id_stringtie) + +combined_stringtie_ncbi_new_names %>% dplyr::select(gene_id,gene_id_stringtie, + transcript_id, transcript_id_stringtie, class_code) + +rtracklayer::export(combined_stringtie_ncbi_new_names, "combined_stringtie_ncbi_new_names.gtf", format = gtf) + + cat('"', Sys.getenv("TASK_PROCESS", "SELECT_LNCRNAS"), '":\n', file="versions.yml", sep="") + cat(' r-base: "', R.version.string, '"\n', file="versions.yml", append=TRUE, sep="") + cat(' tidyverse: "', as.character(packageVersion("tidyverse")), '"\n', file = "versions.yml", append = TRUE, sep="") + cat(' rtracklayer: "', as.character(packageVersion("rtracklayer")), '"\n', file = "versions.yml", append = TRUE, sep="") + \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index 48fa69e..c555822 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -40,7 +40,7 @@ process { } withName: 'STRINGTIE_MERGE' { - ext.args = '-T 1' + ext.args = '-T 1 -m 200' } withName: 'GFFREAD' { diff --git a/modules/local/selectlnc/main.nf b/modules/local/selectlnc/main.nf index 9f5066a..57d26b1 100644 --- a/modules/local/selectlnc/main.nf +++ b/modules/local/selectlnc/main.nf @@ -11,7 +11,6 @@ process SELECT_LNCRNAS { 'community.wave.seqera.io/library/bioconductor-rtracklayer_r-tidyverse:4e377b50350447f9' : 'community.wave.seqera.io/library/bioconductor-rtracklayer_r-tidyverse:4e377b50350447f9' }" - publishDir "${params.outdir}/lncrnas", mode: 'copy' input: @@ -20,13 +19,14 @@ process SELECT_LNCRNAS { tuple val(meta3), path(plek_results) tuple val(meta4), path(pfam_domtblout) path reference_gtf + path lncselect_script // Adicionar o script como input output: tuple val(meta), path("gtf_lncRNA.gtf"), emit: lncrna_gtf tuple val(meta), path("gtf_complete.gtf"), emit: complete_gtf tuple val(meta), path("combined_stringtie_ncbi.gtf"), emit: combined_gtf tuple val(meta), path("combined_stringtie_ncbi_new_names.gtf"), emit: final_gtf - tuple val(meta), path("*.log"), emit: log + //tuple val(meta), path("*.log"), emit: log path "versions.yml", emit: versions when: @@ -39,295 +39,19 @@ process SELECT_LNCRNAS { def pfam_evalue = task.ext.pfam_evalue ?: '1e-10' """ - #!/usr/bin/env Rscript - - # Carregar bibliotecas necessárias - library(tidyverse) - library(rtracklayer) - - # Configurar log - log_file <- "${prefix}.log" - sink(log_file, append = TRUE, split = TRUE) - - cat("=== SELEÇÃO DE lncRNAs ===\\n") - cat("Iniciando seleção de lncRNAs e combinação com anotação de referência\\n") - cat("Data/hora de início:", as.character(Sys.time()), "\\n") - cat("\\n") - - # Ler arquivos de entrada - cat("Lendo arquivos de entrada...\\n") - cat(" GTF anotado: ${annotated_gtf}\\n") - cat(" Resultados CPC2: ${cpc2_results}\\n") - cat(" Resultados PLEK: ${plek_results}\\n") - cat(" Domínios Pfam: ${pfam_domtblout}\\n") - cat(" GTF de referência: ${reference_gtf}\\n") - cat("\\n") - - # Carregar GTF anotado - dup_reference_annot <- as.data.frame(rtracklayer::import("${annotated_gtf}")) - cat("GTF anotado carregado:", nrow(dup_reference_annot), "linhas\\n") - - # Carregar resultados CPC2 - CPC2 <- read.delim("${cpc2_results}") - colnames(CPC2) <- c("transcript_id", "transcript_length", "peptide_length", "Fickett_score", - "pI", "ORF_integrity", "coding_probability", "CPC2") - cat("Resultados CPC2 carregados:", nrow(CPC2), "transcritos\\n") - cat(" Codificantes:", sum(CPC2\$CPC2 == "coding"), "\\n") - cat(" Não-codificantes:", sum(CPC2\$CPC2 == "noncoding"), "\\n") - - # Carregar resultados PLEK - PLEK <- read.delim("${plek_results}", header = FALSE) - PLEK <- PLEK %>% mutate(transcript_id = str_extract(V3, "(?<=\\\\>).*?(?= loc:)")) - colnames(PLEK) <- c("PLEK", "value", "features", "transcript_id") - cat("Resultados PLEK carregados:", nrow(PLEK), "transcritos\\n") - cat(" Codificantes:", sum(PLEK\$PLEK == "Coding"), "\\n") - cat(" Não-codificantes:", sum(PLEK\$PLEK == "Non-coding"), "\\n") - - # Carregar domínios Pfam - pfam <- readr::read_table("${pfam_domtblout}", comment = "#", col_names = FALSE) - cat("Domínios Pfam carregados:", nrow(pfam), "hits\\n") - - # Identificar transcritos com domínios significativos - transcripts_with_domain <- as.data.frame(pfam) %>% - dplyr::filter(X7 <= ${pfam_evalue}) %>% - mutate(transcript_id = str_remove(X1, "\\\\.p[0-9]+\$")) %>% # Remove .p1, .p2 etc - distinct(transcript_id) %>% - mutate(domain_present = "yes") - - cat("Transcritos com domínios Pfam significativos (E-value <= ${pfam_evalue}):", nrow(transcripts_with_domain), "\\n") - - # Combinar todos os dados - cat("\\nCombinando dados...\\n") - gtf_complete <- merge(dup_reference_annot, transcripts_with_domain, by = "transcript_id", all.x = TRUE) - gtf_complete <- merge(gtf_complete, PLEK[,c("transcript_id", "PLEK")], by = "transcript_id", all.x = TRUE) - gtf_complete <- merge(gtf_complete, CPC2[,c("transcript_id", "transcript_length", "peptide_length", "CPC2")], by = "transcript_id", all.x = TRUE) - - cat("GTF completo criado:", nrow(gtf_complete), "linhas\\n") - - # Aplicar filtros para seleção de lncRNAs - cat("\\nAplicando filtros para seleção de lncRNAs...\\n") - cat("Critérios:\\n") - cat(" - class_code em: x, i, u\\n") - cat(" - Comprimento >= ${min_length} nt\\n") - cat(" - PLEK == 'Non-coding' E CPC2 == 'noncoding'\\n") - cat(" - Sem domínios Pfam significativos\\n") - cat("\\n") - - # Contar transcritos em cada etapa do filtro - step1 <- gtf_complete %>% dplyr::filter(class_code %in% c("x", "i", "u")) - cat("Após filtro class_code:", length(unique(step1\$transcript_id)), "transcritos únicos\\n") - - step2 <- step1 %>% dplyr::filter(transcript_length >= ${min_length}) - cat("Após filtro comprimento:", length(unique(step2\$transcript_id)), "transcritos únicos\\n") - - step3 <- step2 %>% dplyr::filter(PLEK == "Non-coding" & CPC2 == "noncoding") - cat("Após filtros CPC2 e PLEK:", length(unique(step3\$transcript_id)), "transcritos únicos\\n") - - step4 <- step3 %>% dplyr::filter(is.na(domain_present)) - cat("Após filtro domínios Pfam:", length(unique(step4\$transcript_id)), "transcritos únicos\\n") - - # Selecionar lncRNAs finais - lncRNA <- step4 %>% pull(transcript_id) %>% unique() - cat("\\nlncRNAs selecionados:", length(lncRNA), "transcritos\\n") - - # Criar GTF de lncRNAs - gtf_lncRNA <- gtf_complete %>% - filter(transcript_id %in% lncRNA) %>% - mutate(transcript_biotype = ifelse(type == "transcript", "lncRNA", "")) - - cat("GTF lncRNA criado:", nrow(gtf_lncRNA), "linhas\\n") - - # Verificar genes com diferentes class_codes - cat("\\nVerificando genes com diferentes class_codes...\\n") - lncRNAs_check <- gtf_lncRNA %>% - dplyr::filter(type == "transcript") %>% - dplyr::select(class_code, gene_id) %>% - distinct() - - duplicated_genes <- nrow(lncRNAs_check) - length(unique(gtf_lncRNA\$gene_id)) - cat("Genes com diferentes class_codes:", duplicated_genes, "\\n") - - # Exportar GTFs intermediários - cat("\\nExportando GTFs...\\n") - rtracklayer::export(gtf_lncRNA, "gtf_lncRNA.gtf") - rtracklayer::export(gtf_complete, "gtf_complete.gtf") - cat(" gtf_lncRNA.gtf exportado\\n") - cat(" gtf_complete.gtf exportado\\n") - - # Combinar com anotação de referência - cat("\\nCombinando com anotação de referência...\\n") - reference <- as.data.frame(rtracklayer::import("${reference_gtf}")) - reference_filter <- reference %>% filter(type %in% c("transcript", "exon")) - cat("Referência carregada:", nrow(reference_filter), "linhas (transcript + exon)\\n") - - combined_stringtie_ncbi <- dplyr::bind_rows(reference_filter, gtf_lncRNA) - - combined_stringtie_ncbi\$Dbxref <- sapply(combined_stringtie_ncbi\$Dbxref, function(x) { - if (is.null(x)) return(NA_character_) - paste(x, collapse = ",") - }) - - combined_stringtie_ncbi\$end_range <- sapply(combined_stringtie_ncbi\$end_range, function(x) { - if (is.null(x)) return(NA_character_) - paste(x, collapse = ",") - }) - - combined_stringtie_ncbi\$start_range <- sapply(combined_stringtie_ncbi\$start_range, function(x) { - if (is.null(x)) return(NA_character_) - paste(x, collapse = ",") - }) - - combined_stringtie_ncbi\$Parent <- sapply(combined_stringtie_ncbi\$Parent, function(x) { - if (is.null(x)) return(NA_character_) - paste(x, collapse = ",") - }) - - combined_stringtie_ncbi\$Note <- sapply(combined_stringtie_ncbi\$Note, function(x) { - if (is.null(x)) return(NA_character_) - paste(x, collapse = ",") - }) - - combined_stringtie_ncbi\$gene_id[is.na(combined_stringtie_ncbi\$gene_id)] <- "unknown_gene" - combined_stringtie_ncbi\$transcript_id[is.na(combined_stringtie_ncbi\$transcript_id)] <- "unknown_transcript" - combined_stringtie_ncbi\$exon_number[is.na(combined_stringtie_ncbi\$exon_number)] <- "0" - - # Criar a coluna `attribute` no formato GTF - combined_stringtie_ncbi\$attribute <- paste0( - 'gene_id "', combined_stringtie_ncbi\$gene_id, '"; ', - 'transcript_id "', combined_stringtie_ncbi\$transcript_id, '"; ', - 'exon_number "', combined_stringtie_ncbi\$exon_number, '";' - ) - - # Criar o GRanges com os campos obrigatórios - combined_stringtie_ncbi <- GRanges( - seqnames = combined_stringtie_ncbi\$seqnames, - ranges = IRanges(start = combined_stringtie_ncbi\$start, end = combined_stringtie_ncbi\$end), - strand = combined_stringtie_ncbi\$strand, - - # metadados (evite nomes reservados) - source = combined_stringtie_ncbi\$source, - type = combined_stringtie_ncbi\$type, - class_code = combined_stringtie_ncbi\$class_code, - transcript_id = combined_stringtie_ncbi\$transcript_id, - gene_id = combined_stringtie_ncbi\$gene_id, - db_xref = sapply(combined_stringtie_ncbi\$Dbxref, paste, collapse = ","), - gbkey = combined_stringtie_ncbi\$gbkey, - locus_tag = combined_stringtie_ncbi\$locus_tag, - partial = combined_stringtie_ncbi\$partial, - orig_protein_id = combined_stringtie_ncbi\$orig_protein_id, - orig_transcript_id = combined_stringtie_ncbi\$orig_transcript_id, - product = combined_stringtie_ncbi\$product, - transcript_biotype = combined_stringtie_ncbi\$transcript_biotype, - exon_number = combined_stringtie_ncbi\$exon_number, - pseudo = combined_stringtie_ncbi\$pseudo, - PLEK = combined_stringtie_ncbi\$PLEK, - transcript_length = combined_stringtie_ncbi\$transcript_length, - peptide_length = combined_stringtie_ncbi\$peptide_length, - CPC2 = combined_stringtie_ncbi\$CPC2, - xloc = combined_stringtie_ncbi\$xloc, - tss_id = combined_stringtie_ncbi\$tss_id, - cmp_ref = combined_stringtie_ncbi\$cmp_ref, - transcript_id_stringtie = NA_character_, - gene_id_stringtie = NA_character_, - score = combined_stringtie_ncbi\$score, - phase = combined_stringtie_ncbi\$phase, - attribute = combined_stringtie_ncbi\$attribute - ) - - cat("GTF combinado criado:", nrow(combined_stringtie_ncbi), "linhas\n") - - # Verificar sobreposição - overlap_check <- sum(gtf_lncRNA\$transcript_id %in% reference\$transcript_id) - cat("Sobreposição transcript_id (esperado 0):", overlap_check, "\n") - - rtracklayer::export(combined_stringtie_ncbi, "combined_stringtie_ncbi.gtf") - cat(" combined_stringtie_ncbi.gtf exportado\n") - - # Renomear IDs - cat("\nRenomeando IDs...\n") - combined_stringtie_ncbi_new_names <- as.data.frame(rtracklayer::import("combined_stringtie_ncbi.gtf")) - - new_names <- combined_stringtie_ncbi_new_names %>% - dplyr::filter(type == "transcript") %>% - mutate(transcript_id_new = ifelse(source == "StringTie", - paste0("Pb18_lncRNA", class_code, gsub("MSTRG.", "_", transcript_id)), - transcript_id)) %>% - mutate(gene_id_new = ifelse(source == "StringTie", - paste0("Pb18_lncRNA", gsub("MSTRG.", "_", gene_id)), - gene_id)) %>% - dplyr::select(transcript_id, transcript_id_new, gene_id, gene_id_new) %>% distinct() - - cat("Novos nomes criados para:", nrow(new_names), "transcritos\n") - - # Aplicar novos nomes - combined_stringtie_ncbi_new_names <- left_join(combined_stringtie_ncbi_new_names, - new_names %>% dplyr::select(transcript_id, transcript_id_new) %>% distinct(), - by = "transcript_id") - - combined_stringtie_ncbi_new_names <- left_join(combined_stringtie_ncbi_new_names, - new_names %>% dplyr::select(gene_id, gene_id_new) %>% distinct(), - by = "gene_id") - - # Reorganizar colunas - combined_stringtie_ncbi_new_names <- combined_stringtie_ncbi_new_names %>% - dplyr::rename(transcript_id_stringtie = transcript_id) %>% - dplyr::rename(gene_id_stringtie = gene_id) %>% - dplyr::rename(transcript_id = transcript_id_new) %>% - dplyr::rename(gene_id = gene_id_new) %>% - dplyr::select(seqnames, start, end, width, strand, - source, type, score, phase, gene_id, - transcript_id, db_xref, gbkey, locus_tag, partial, - orig_protein_id, orig_transcript_id, product, transcript_biotype, exon_number, - pseudo, PLEK, transcript_length, peptide_length, CPC2, - xloc, class_code, tss_id, cmp_ref, - transcript_id_stringtie, gene_id_stringtie) - - - rtracklayer::export(combined_stringtie_ncbi_new_names, "combined_stringtie_ncbi_new_names.gtf") - cat(" combined_stringtie_ncbi_new_names.gtf exportado\\n") - - # Estatísticas finais - cat("\\n=== ESTATÍSTICAS FINAIS ===\\n") - cat("Total de lncRNAs identificados:", length(lncRNA), "\\n") - - # Estatísticas por class_code - class_stats <- gtf_lncRNA %>% - filter(type == "transcript") %>% - count(class_code) %>% - arrange(desc(n)) - - cat("Distribuição por class_code:\\n") - for(i in 1:nrow(class_stats)) { - cat(" ", class_stats\$class_code[i], ":", class_stats\$n[i], "\\n") - } - - # Estatísticas de comprimento - length_stats <- gtf_lncRNA %>% - filter(type == "transcript") %>% - summarise( - mean_length = mean(transcript_length, na.rm = TRUE), - median_length = median(transcript_length, na.rm = TRUE), - min_length = min(transcript_length, na.rm = TRUE), - max_length = max(transcript_length, na.rm = TRUE) - ) - - cat("Estatísticas de comprimento:\\n") - cat(" Média:", round(length_stats\$mean_length, 2), "nt\\n") - cat(" Mediana:", length_stats\$median_length, "nt\\n") - cat(" Mínimo:", length_stats\$min_length, "nt\\n") - cat(" Máximo:", length_stats\$max_length, "nt\\n") - - cat("\\nProcessamento concluído com sucesso!\\n") - cat("Data/hora de término:", as.character(Sys.time()), "\\n") - sink() - - # Criar arquivo de versões - cat('"NFCORE_NOTTOCODE:NOTTOCODE:SELECT_LNCRNAS":\n', file="versions.yml") - cat(' r-base: "', R.version.string, '"\n', file="versions.yml", append=TRUE) - cat(' tidyverse: "', as.character(packageVersion("tidyverse")), '"\n', file = "versions.yml", append = TRUE) - cat(' rtracklayer: "', as.character(packageVersion("rtracklayer")), '"\n', file = "versions.yml", append = TRUE) + # Executar o script R com as variáveis de ambiente + export ANNOTATED_GTF="${annotated_gtf}" + export CPC2_RESULTS="${cpc2_results}" + export PLEK_RESULTS="${plek_results}" + export PFAM_DOMTBLOUT="${pfam_domtblout}" + export REFERENCE_GTF="${reference_gtf}" + export PREFIX="${prefix}" + export MIN_LENGTH="${min_length}" + export PFAM_EVALUE="${pfam_evalue}" + + Rscript ${lncselect_script} """ - + stub: def prefix = task.ext.prefix ?: "${meta.id}" """ diff --git a/nextflow.config b/nextflow.config index bacb242..75198a1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -51,6 +51,7 @@ params { mstrg_prep_script = "${projectDir}/assets/mstrg_prep.pl" plek = "${projectDir}/assets/PLEK.1.2.tar.xz" cpc2 = "${projectDir}/assets/CPC2_standalone" + lncselect = "${projectDir}/assets/lncselect.R" // Pipeline specific options diff --git a/workflows/nottocode.nf b/workflows/nottocode.nf index 3bca03a..ac9f27b 100644 --- a/workflows/nottocode.nf +++ b/workflows/nottocode.nf @@ -50,8 +50,12 @@ workflow NOTTOCODE { error "ERROR: missing --reference_genome fasta" } + if (!params.reference_gff) { + error "ERROR: missing --reference_gff annotation file" + } + if (!params.reference_gtf) { - error "ERROR: missing --reference_gtf annotation file" + error "ERROR: missing --reference_gtf annotation file" } if (!params.pfam_db) { @@ -101,6 +105,7 @@ workflow NOTTOCODE { // Refence files // ch_genome_fasta = file(params.reference_genome) + ch_reference_gff = file(params.reference_gff) ch_reference_gtf = file(params.reference_gtf) // @@ -142,7 +147,7 @@ workflow NOTTOCODE { STRINGTIE_MERGE ( ch_gtf_files, - ch_reference_gtf + ch_reference_gff ) ch_versions = ch_versions.mix(STRINGTIE_MERGE.out.versions) @@ -162,7 +167,7 @@ workflow NOTTOCODE { GFFCOMPARE ( ch_merged_gtf, - ch_reference_gtf + ch_reference_gff ) ch_versions = ch_versions.mix(GFFCOMPARE.out.versions) @@ -282,12 +287,16 @@ workflow NOTTOCODE { ) ch_versions = ch_versions.mix(HMMER_HMMSEARCH.out.versions) + + + lncselect_script_ch = Channel.fromPath(params.lncselect) SELECT_LNCRNAS ( - MSTRG_PREP.out.gtf_prep, + GFFCOMPARE.out.annotated_gtf, CPC2.out.cpc2_results, PLEK.out.plek_results, HMMER_HMMSEARCH.out.domain_summary, - ch_reference_gtf + ch_reference_gtf, + lncselect_script_ch ) ch_versions = ch_versions.mix(SELECT_LNCRNAS.out.versions)