GoekeLab
diff --git a/‎.github/workflows/check-bioc.yml‎
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/check-bioc.yml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 2 additions & 1 deletion b/‎DESCRIPTION‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎R/bambu-processReads_utilityConstructReadClasses.R‎
Lines changed: 34 additions & 32 deletions b/‎R/bambu-processReads_utilityConstructReadClasses.R‎
Lines changed: 34 additions & 32 deletions
diff --git a/‎R/bambu-quantify_utilityFunctions.R‎
Lines changed: 2 additions & 2 deletions b/‎R/bambu-quantify_utilityFunctions.R‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/bambu_utilityFunctions.R‎
Lines changed: 15 additions & 12 deletions b/‎R/bambu_utilityFunctions.R‎
Lines changed: 15 additions & 12 deletions
diff --git a/‎R/sysdata.rda‎
-7 Bytes b/‎R/sysdata.rda‎
-7 Bytes
diff --git a/‎README.md‎
Lines changed: 16 additions & 4 deletions b/‎README.md‎
Lines changed: 16 additions & 4 deletions
diff --git a/‎inst/CITATION‎
Lines changed: 7 additions & 7 deletions b/‎inst/CITATION‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎inst/extdata/seOutputCombined2_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds‎
12 Bytes b/‎inst/extdata/seOutputCombined2_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds‎
12 Bytes
diff --git a/‎inst/extdata/seOutputCombinedExtended_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds‎
0 Bytes b/‎inst/extdata/seOutputCombinedExtended_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds‎
0 Bytes
@@ -54,9 +54,9 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - { os: ubuntu-latest, r: '4.3', bioc: '3.17', cont: "bioconductor/bioconductor_docker:RELEASE_3_17", rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest" }
-          - { os: macOS-latest, r: '4.3', bioc: '3.17'}
-          - { os: windows-latest, r: '4.3', bioc: '3.17'}
+          - { os: ubuntu-latest, r: '4.3', bioc: '3.18', cont: "bioconductor/bioconductor_docker:RELEASE_3_18", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
+          - { os: macOS-latest, r: '4.3', bioc: '3.18'}
+            ##- { os: windows-latest, r: '4.3', bioc: '3.18'}
           ## Check https://github.com/r-lib/actions/tree/master/examples
           ## for examples using the http-user-agent
     env:
 
@@ -1,7 +1,7 @@
 Package: bambu
 Type: Package
 Title: Context-Aware Transcript Quantification from Long Read RNA-Seq data
-Version: 3.3.2
+Version: 3.5.1
 Authors@R: c(person("Ying", "Chen", role = c("cre","aut"),
              email = "chen_ying@gis.a-star.edu.sg"),
              person("Andre", "Sim", role = "aut",
@@ -57,6 +57,7 @@ biocViews:
     GenomeAnnotation,
     GenomeAssembly,
     ImmunoOncology,
+    LongRead,
     MultipleComparison,
     Normalization, 
     RNASeq, 
 
@@ -246,11 +246,12 @@ constructUnsplicedReadClasses <- function(reads.singleExon, annotations,
     counts = as.data.frame(reads.singleExon) %>% 
         mutate(id = mcols(reads.singleExon)$id) %>% 
         group_by(seqnames,start,end,strand) %>% 
-        summarise(n=n(), id = list(id)) %>% 
+        mutate(n=n(), id = list(id)) %>%  # change summarise to mutate as summarise will reorder the table
+        ungroup() %>%
         as.data.frame()
-    reads.singleExon = unique(reads.singleExon)
     mcols(reads.singleExon)$counts <- counts$n
     mcols(reads.singleExon)$id <- counts$id
+     reads.singleExon = unique(reads.singleExon)
 
     rcUnsplicedAnnotation <- getUnsplicedReadClassByReference(
         granges = reads.singleExon, grangesReference = referenceExons,
@@ -385,10 +386,12 @@ assignGeneIds <-  function(grl, annotations, min.exonOverlap = 10, fusionMode =
                                           fusionMode = fusionMode) 
     #iteratively assign gene ids for stranded granges
     newGeneSet <- is.na(mcols(grl)$GENEID) & strandedRanges
-    referenceGeneSet <- !is.na(mcols(grl)$GENEID) & strandedRanges
-    mcols(grl)$GENEID[newGeneSet] <- assignGeneIdsByReference(grl[newGeneSet], grl[referenceGeneSet],
-                                                                  min.exonOverlap = min.exonOverlap,
-                                                                  fusionMode = FALSE)  # fusion assignment is only done based on original annotations
+    if(sum(newGeneSet != 0)){
+        referenceGeneSet <- !is.na(mcols(grl)$GENEID) & strandedRanges
+        mcols(grl)$GENEID[newGeneSet] <- assignGeneIdsByReference(grl[newGeneSet], grl[referenceGeneSet],
+                                                                    min.exonOverlap = min.exonOverlap,
+                                                                    fusionMode = FALSE)  # fusion assignment is only done based on original annotations
+    }
     chainCount <- 1  # first iteration is outside of while loop
     while(any(!is.na(mcols(grl)$GENEID[newGeneSet])) & chainCount < maxChainIteration) {
       referenceGeneSet <- newGeneSet & !is.na(mcols(grl)$GENEID) & strandedRanges
@@ -440,32 +443,31 @@ assignGeneIdsByReference <- function(grl, annotations, min.exonOverlap = 10,
     uniqueHits <- which(queryHits(ov) %in% which(countQueryHits(ov)==1))
     geneIds[queryHits(ov)[uniqueHits]] <- 
         names(geneRanges)[subjectHits(ov)[uniqueHits]]
-    
-    ## next for non unique hits select one gene (maximum overlap)
-    multiHits <- which(queryHits(ov) %in% which(countQueryHits(ov)>1))
-    expandedRanges <- expandRangesList(ranges(grl[queryHits(ov)[multiHits]]),
-        ranges(geneRanges[subjectHits(ov)[multiHits]]))
-    rangeIntersect <- pintersect(expandedRanges, 
-        mcols(expandedRanges)$matchRng, resolve.empty = 'start.x')
-    intersectById <- tapply(width(rangeIntersect), 
-                            mcols(expandedRanges)$IdMap, sum)
-    
-    filteredMultiHits <- as_tibble(ov[multiHits]) %>% 
-        mutate(intersectWidth = intersectById)
-    if(fusionMode) {
-      filteredMultiHits <- filteredMultiHits %>%  
-        filter(intersectWidth>min.exonOverlap) %>%  
-        mutate(geneid = names(geneRanges)[subjectHits]) %>%  distinct() %>% 
-        group_by(queryHits) %>% summarise(geneid = paste(geneid, collapse=':'))
-      geneIds[filteredMultiHits$queryHits] <- filteredMultiHits$geneid
-      
-    } else {
-    filteredMultiHits <- filteredMultiHits %>% 
-        group_by(queryHits) %>% arrange(desc(intersectWidth)) %>% 
-        dplyr::slice(1)
-    geneIds[filteredMultiHits$queryHits] <- 
-        names(geneRanges)[filteredMultiHits$subjectHits]
-    } 
+    if(length(ov)>0){
+        ## next for non unique hits select one gene (maximum overlap)
+        multiHits <- which(queryHits(ov) %in% which(countQueryHits(ov)>1))
+        rangeIntersect= intersect(ranges(grl[queryHits(ov)[multiHits]]),
+                                    ranges(geneRanges[subjectHits(ov)[multiHits]]))
+        filteredMultiHits =  data.frame(queryHits = queryHits(ov)[multiHits], 
+                                        intersectWidth = sum(width(rangeIntersect)), 
+                                        subjectHits = subjectHits(ov)[multiHits]) %>% 
+            group_by(queryHits) %>% summarise(subjectHits = subjectHits[which.max(intersectWidth)],
+                                                    intersectWidth = max(intersectWidth))
+        if(fusionMode) {
+        filteredMultiHits <- filteredMultiHits %>%  
+            filter(intersectWidth>min.exonOverlap) %>%  
+            mutate(geneid = names(geneRanges)[subjectHits]) %>%  distinct() %>% 
+            group_by(queryHits) %>% summarise(geneid = paste(geneid, collapse=':'))
+        geneIds[filteredMultiHits$queryHits] <- filteredMultiHits$geneid
+        
+        } else {
+        filteredMultiHits <- filteredMultiHits %>% 
+            group_by(queryHits) %>% arrange(desc(intersectWidth)) %>% 
+            dplyr::slice(1)
+        geneIds[filteredMultiHits$queryHits] <- 
+            names(geneRanges)[filteredMultiHits$subjectHits]
+        } 
+    }
     return(geneIds)
 }
 
 
@@ -20,8 +20,8 @@ modifyIncompatibleAssignment <- function(distTable){
 #' Process incompatible counts
 #' @noRd
 processIncompatibleCounts <- function(readClassDist){
-  distTable <- data.table(as.data.frame(metadata(readClassDist)$distTable))[, 
-                                                                            .(readClassId, annotationTxId, readCount, GENEID, dist,equal)]
+  distTable <- unique(data.table(as.data.frame(metadata(readClassDist)$distTable))[, 
+               .(readClassId, annotationTxId, readCount, GENEID, equal)], by = NULL)
   distTableIncompatible <- distTable[grep("unidentified", annotationTxId)]
   # filter out multiple geneIDs mapped to the same readClass using rowData(se)
   geneRCMap <- as.data.table(as.data.frame(rowData(readClassDist)),
 
@@ -170,7 +170,7 @@ checkInputSequence <- function(genomeSequence) {
     },
     error=function(cond) {
         stop("Input genome file not readable.",
-            "Requires a FASTA or BSgenome name")
+            " Requires a FASTA or BSgenome name")
     }
     )}
     return(genomeSequence)
@@ -184,24 +184,27 @@ handleWarnings <- function(readClassList, verbose){
     sampleNames = c()
     for(i in seq_along(readClassList)){
         readClassSe = readClassList[[i]]
-        if (is.character(readClassSe)) 
-            readClassSe <- readRDS(file = readClassSe)
-        warnings[[i]] = metadata(readClassSe)$warnings
+        if (is.character(readClassSe)){
+            readClassSe <- readRDS(file = readClassSe)}
+        warnings[[i]] = NA
+        if(!is.null(metadata(readClassSe)$warnings)){
+            warnings[[i]] = metadata(readClassSe)$warnings}
         sampleNames = c(sampleNames, colnames(readClassList[[i]]))
     }
     names(warnings) = sampleNames
 
-    if(verbose & any(lengths(warnings)>0)){
+    if(verbose & any(!is.na(warnings))){
         message("--- per sample warnings during read class construction ---")
-        for(i in seq_along(warnings)){
-            if(lengths(warnings)[i]>0){
-                message("Warnings for: ", sampleNames[i])
-                sapply(warnings[[i]], message)
-            }
+        warnings.tmp = warnings[!is.na(warnings)]
+        for(i in seq_along(warnings.tmp)){
+            message("Warnings for: ", names(warnings.tmp)[i])
+            sapply(warnings.tmp[[i]], message)
         }
     } else {
-        message("Detected ", sum(lengths(warnings)), " warnings across the samples during ",
-        "read class construction. Access warnings with metadata(bambuOutput)$warnings")
+        warningCount = sum(lengths(warnings[!is.na(warnings)]))
+        if(warningCount > 0){
+            message("Detected ", warningCount, " warnings across the samples during ",
+        "read class construction. Access warnings with metadata(bambuOutput)$warnings")}
     }
     return(warnings)
 }
 
@@ -113,7 +113,7 @@ The bambuAnnotation object can be calculated from:
 
 a) a .gtf file:
 ```rscript
-annotations <- prepareAnnotation(gtf.file)
+annotations <- prepareAnnotations(gtf.file)
 ```
 b) a TxDb object
 ```rscript
@@ -444,7 +444,7 @@ se <- bambu(reads = fusionAligned.bam, annotations = fusionAnnotations, genome =
 |reads|A string or a vector of strings specifying the paths of bam files for genomic alignments, or a BamFile object or a BamFileList object (from Rsamtools).|
 | rcOutDir | A string variable specifying the path to where read class files will be saved. |
 | annotations | A TxDb object, a path to a .gtf file, or a GRangesList object obtained by prepareAnnotations. |
-| genome | A fasta file or a BSGenome object. |
+| genome | A fasta file or a BSGenome object. If a .fa.gz file is provided, the corresponding .fai and .gzi index files must also be present. |
 | stranded | A boolean for strandedness, defaults to FALSE. |
 | ncore | specifying number of cores used when parallel processing is used, defaults to 1. |
 | NDR | specifying the maximum NDR rate to novel transcript output among detected transcripts, defaults to 0.1 |
@@ -498,7 +498,19 @@ rowData(se)
 
 ### Release History
 
-**bambu v3.2.6**
+**bambu v3.2.5**
+
+Release date: 2023-Jul-07
+
+Minor changes:
+
+- Fix crash when extremely large datasets are provided
+- Speed up read class construction 
+- Add LongRead BiocView 
+- Update release history
+
+
+**bambu v3.2.4**
 
 Release date: 2023-Apr-26
 
@@ -592,7 +604,7 @@ Release date: 2020-06-18
 Release date: 2020-05-29 
 
 ### Citation
-Chen, Ying, et al. "Context-Aware Transcript Quantification from Long Read RNA-Seq data with Bambu" bioRxiv (2022). doi: https://doi.org/10.1101/2022.11.14.516358
+Chen, Y., Sim, A., Wan, Y.K. et al. Context-aware transcript quantification from long-read RNA-seq data with Bambu. Nat Methods (2023). https://doi.org/10.1038/s41592-023-01908-w
 
 ### Contributors
 
 
@@ -1,5 +1,5 @@
 citEntry(entry="article",
-         title = "Context-Aware Transcript Quantification from Long Read RNA-Seq data with Bambu",
+         title = "Context-aware transcript quantification from long-read RNA-seq data with Bambu",
          author = personList( as.person("Ying Chen"),
                               as.person("Andre Sim"),
                               as.person("Yuk Kei Wan"),
@@ -8,10 +8,10 @@ citEntry(entry="article",
                               as.person("Min Hao Ling"),
                               as.person("Michael I. Love"),
                               as.person("Jonathan Göke")),
-         year = 2022,
-         journal = "bioRxiv",
-         doi = "https://doi.org/10.1101/2022.11.14.516358",
+         year = 2023,
+         journal = "Nature Methods",
+         doi = "https://doi.org/10.1038/s41592-023-01908-w",
          textVersion = 
-         paste("Chen, Y., Sim, A. D., Wan, Y. K., Yeo, K., Lee, J. J. X., Ling, M. H., ... & Göke, J.", 
-               "Context-Aware Transcript Quantification from Long Read RNA-Seq data with Bambu",
-                "bioRxiv (2022)" ) )
+         paste("Chen, Y., Sim, A., Wan, Y. K., Yeo, K., Lee, J. J. X., Ling, M. H., Love, M. I. & Göke, J.", 
+               "Context-aware transcript quantification from long-read RNA-seq data with Bambu",
+                "Nat Methods (2023)" ) )