#!/bin/bash
#SBATCH -J cutadapt
#SBATCH -c 20
#SBATCH --mem=5GB
#SBATCH -t 03-00 #Acceptable time formats include "minutes", "minutes:seconds", "hours:minutes:seconds", "days-hours", "days-hours:minutes" and "days-hours:minutes:seconds".
#SBATCH --output=slurm-%x.%j.out
#SBATCH -p workq
# 16S primers
# five_prime_primer=AGRGTTYGATYMTGGCTCAG
# three_prim_primer=AAGTCGTAACAAGGTARCY
# 16S-23S primers
five_prime_primer=AGRGTTTGATYHTGGCTCAG
three_prim_primer=AGTTTDACTGGGGYGGT
# ACCRCCCCAGTHAAACT is the reverse complement of AGTTTDACTGGGGYGGT
outdir=fastq_no_primers
mkdir -p $outdir/tmp
t=20
for fastq_file in ../fastq/*fastq.gz; do
    echo "======================================="
    fastq_file_name=$(basename "$fastq_file")
    outfile_5prim="$outdir/tmp/${fastq_file_name%.fastq.gz}_cutadapt_no5prim.fastq.gz"
    outfile_final="$outdir/${fastq_file_name%.fastq.gz}.fastq.gz"

    # Remove the 5' primer: --revcomp reorients the reads and
    # --discard-untrimmed drops reads in which the primer is not found.
    # Requiring an overlap of (primer length - 1) avoids spurious partial matches.
    primer_length=${#five_prime_primer}
    overlap="$((primer_length-1))"
    cutadapt -g $five_prime_primer --error-rate 0.1 --discard-untrimmed --match-read-wildcards --revcomp -j $t --overlap $overlap -o $outfile_5prim $fastq_file > ${outfile_5prim%.fastq.gz}.log
    head -n40 ${outfile_5prim%.fastq.gz}.log

    # Remove the 3' primer from the already reoriented reads (no --revcomp needed).
    primer_length=${#three_prim_primer}
    overlap="$((primer_length-1))"
    cutadapt -a $three_prim_primer --error-rate 0.1 --discard-untrimmed --match-read-wildcards -j $t --overlap $overlap -o $outfile_final $outfile_5prim > ${outfile_final%.fastq.gz}.log
    head -n40 ${outfile_final%.fastq.gz}.log
done
The compared revision drops --revcomp from the 3' pass, since the reads are already oriented by the 5' pass:
```
@@ -36,7 +36,7 @@ for fastq_file in ../fastq/*fastq.gz; do
     primer_length=${#three_prim_primer}
     overlap="$((primer_length-1))"
-    cutadapt -a $three_prim_primer --error-rate 0.1 --discard-untrimmed --match-read-wildcards --revcomp -j $t --overlap $overlap -o $outfile_final $outfile_5prim > ${outfile_final%.fastq.gz}.log; head -n40 ${outfile_final%.fastq.gz}.log
+    cutadapt -a $three_prim_primer --error-rate 0.1 --discard-untrimmed --match-read-wildcards -j $t --overlap $overlap -o $outfile_final $outfile_5prim > ${outfile_final%.fastq.gz}.log; head -n40 ${outfile_final%.fastq.gz}.log
 done
```
# getting the fastq data
mkdir fastq
cp -P /work2/project/seqoccin/metaG/metabarcoding/polymerase_test/fastq/*Pbr322* fastq/
# fastqc
sbatch fastqc.sbatch
# Download the reference sequence of the plasmid
## From the NCBI website
NCBI page: https://www.ncbi.nlm.nih.gov/nuccore/J01749.1?report=fasta
Manually download the sequence into plasmide_Pbr322_ncbi.fasta
## From the NEB website
https://international.neb.com/-/media/nebus/page-images/tools-and-resources/interactive-tools/dna-sequences-and-maps/text-documents/pbr322fsa.txt?rev=b1ef8762bbe9431886127f019c09df5b&hash=355F5D1EE26BAC8271AA68193BF15F36
Manually download it into plasmide_Pbr322_neb.fasta
## Sequence comparison
There is a 1 nt difference between the two sequences (EMBOSS needle alignment):
https://www.ebi.ac.uk/Tools/services/web/toolresult.ebi?jobId=emboss_needle-I20210601-152410-0083-31480283-p2m
```
pBR322 1101 CGCTCGCGGCTCTTACCAGCCTAACTTCGATCATTGGACCGCTGATCGTC 1150
|||||||||||||||||||||||||||||||||.||||||||||||||||
J01749.1 1101 CGCTCGCGGCTCTTACCAGCCTAACTTCGATCACTGGACCGCTGATCGTC 1150
```
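For a quick local check, a minimal sketch with Biopython (assumptions: Biopython is installed and the two FASTA files downloaded above are in the working directory). Since the needle alignment shows no indels, a direct base-by-base comparison is enough:
```
# Hypothetical local re-check of the single difference between the two
# downloaded references (no indels, per the needle alignment above).
from Bio import SeqIO

ncbi = SeqIO.read("plasmide_Pbr322_ncbi.fasta", "fasta")
neb = SeqIO.read("plasmide_Pbr322_neb.fasta", "fasta")

assert len(ncbi.seq) == len(neb.seq), "expected equal-length sequences"
for pos, (a, b) in enumerate(zip(str(ncbi.seq).upper(), str(neb.seq).upper()), start=1):
    if a != b:
        print(f"position {pos}: NCBI={a} NEB={b}")
# Expected output, from the alignment above: position 1134, NCBI=C NEB=T.
```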
# Cleaning the reads
PacBio adapters have not been removed by the PacBio post-processing.
Primer + adapter sequences:
fwd: GCAGTCGAACATGTAGCTGACTCAGGTCACAGCTTTAATGCGGTAGTTTATC
rev: TGGATCACTTGTGCAAGCATCACATCGTAGATCGATGATAAGCTGTCAAAC
PacBio adapters:
fwd: GCAGTCGAACATGTAGCTGACTCAGGTCAC
rev: TGGATCACTTGTGCAAGCATCACATCGTAG
## Using the FROGS preprocess tool
Let's remove them together with the primer sequences using the FROGS preprocess tool.
Simple test on the first 200 reads:
```
zcat ../fastq/Accustart-Pbr322-PCR1-1.fastq.gz | head -n800 > h800_Accustart-Pbr322-PCR1-1.fastq
five_prime_primer=GCAGTCGAACATGTAGCTGACTCAGGTCACAGCTTTAATGCGGTAGTTTATC
#three_prim_primer=TGGATCACTTGTGCAAGCATCACATCGTAGATCGATGATAAGCTGTCAAAC needs to be reverse complemented to work with FROGS:
three_prim_primer=GTTTGACAGCTTATCATCGATCTACGATGTGATGCTTGCACAAGTGATCCA
preprocess.py pacbio-hifi --min-amplicon-size 4000 --max-amplicon-size 4500 --five-prim-primer $five_prime_primer --three-prim-primer $three_prim_primer -p 2 --input-reads h800_Accustart-Pbr322-PCR1-1.fastq
```
Preprocess on all samples.
Sort the preprocess_counts.tsv file to see the most abundant sequences:
sort -k2 -rn preprocess_counts.tsv | head
### Launch FROGS on the plasmid samples
In dir frogs_pipeline
Run FROGS up to the OTU filters step (no affiliation).
# Aligning the reads to the plasmid
In mapping_reads_to_plasmide/test
```
module load bioinfo/samtools-1.11
module load bioinfo/minimap2-2.11
reads="/work2/project/seqoccin/metaG/metabarcoding/polymerase_test/plasmide_Pbr322_analysis/fastq/Accustart-Pbr322-PCR1-1.fastq.gz"
ref='/work2/project/seqoccin/metaG/metabarcoding/polymerase_test/plasmide_Pbr322_analysis/plasmide_ref_sequence/plasmide_Pbr322_ncbi.fasta'
read_name=`basename ${reads%.fastq.gz}`
ref_name=`basename ${ref%.fasta}`
output_basename=${read_name}_to_${ref_name}
echo $output_basename
minimap2 -ax asm20 $ref $reads | samtools sort -o ${output_basename}.bam
samtools index ${read_name}_to_${ref_name}.bam
samtools flagstat ${read_name}_to_${ref_name}.bam > ${read_name}_to_${ref_name}.flagstat
samtools coverage ${read_name}_to_${ref_name}.bam > ${read_name}_to_${ref_name}.coverage
```
In IGV, we can clearly see that at position 1134 a T is present and not a C. --> The correct reference is the one from the NEB website!
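The IGV observation can also be checked programmatically; a minimal pysam sketch (assumptions: the indexed BAM name produced by the commands above, and that the reference is named J01749.1 as in the NCBI FASTA header):
```
# Hypothetical re-check of the disputed base with pysam.
import pysam
from collections import Counter

with pysam.AlignmentFile("Accustart-Pbr322-PCR1-1_to_plasmide_Pbr322_ncbi.bam", "rb") as bam:
    # position 1134 (1-based) -> 1133 (0-based); truncate=True keeps only this column
    for column in bam.pileup("J01749.1", 1133, 1134, truncate=True):
        bases = Counter(b.upper() for b in column.get_query_sequences() if b)
        print(f"position {column.reference_pos + 1}: {bases.most_common()}")
# Expect mostly T, whereas the NCBI reference has a C at this position.
```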
## Map FROGS preprocessed reads to the plasmid sequence
In frogs_preprocess/
Run the script align_preprocess_reads_to_plasmide_seq.sbatch
## Map FROGS preprocessed reads to the expected amplicon of the plasmid
Instead of taking the whole plasmid sequence, it is easier to map the reads only to the expected amplicon sequence (a subsequence of the plasmid). That makes it much easier to compute the error rate at each position; see the sketch below.
Run the script align_preprocess_reads_to_expected_amplicon_seq.sbatch
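A minimal sketch of that per-position computation with pysam (assumptions: the BAM file name from the filtering step below, and that the MD tag is present, e.g. via minimap2 --MD or samtools calmd):
```
# Hypothetical per-position mismatch-rate computation with pysam.
import pysam
from collections import defaultdict

depth = defaultdict(int)       # aligned bases per reference position
mismatches = defaultdict(int)  # mismatching bases per reference position

with pysam.AlignmentFile("preprocess_reads_to_expected_seq.bam", "rb") as bam:
    for read in bam.fetch():
        # with_seq=True requires the MD tag; the returned reference base
        # is lowercase at substitution positions.
        for qpos, rpos, ref_base in read.get_aligned_pairs(with_seq=True):
            if qpos is None or rpos is None:
                continue  # insertions/deletions are ignored in this sketch
            depth[rpos] += 1
            if ref_base.islower():
                mismatches[rpos] += 1

for rpos in sorted(depth):
    print(rpos + 1, f"{mismatches[rpos] / depth[rpos]:.4f}")
```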
# Analyse amplicon error
This type of analysis was already performed here: /work2/project/seqoccin/metaG/metabarcoding/16S-23S_27_10_2020/ADNmockbact/frogs_results/preprocess_affiliation/map_amplicon_to_genomes
It has also been done alongside the DADA2 analysis of CCS data: https://benjjneb.github.io/LRASManuscript/LRASms_Zymo.html
Filter the BAM to get a smaller dataset to work with:
```
module load bioinfo/samtools-1.11
conda activate jupy
cp ~/metaG/axis-3-whole-metagenome/A3P2_samples/scripts/filter_samfile_by_read_list.py .
# filter_samfile_by_read_list.py was modified to strip the size info from the read names so that they can be matched against the read list
samtools view preprocess_reads_to_expected_seq.bam | python filter_samfile_by_read_list.py -r best_2k_preprocess_reads.list -v > best_2k_preprocess_reads_to_expected_seq.sam
```
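For reference, a minimal re-implementation of the modified filter (hypothetical; the real filter_samfile_by_read_list.py may differ), reading SAM records on stdin and keeping only the listed reads after stripping a `;size=N`-style suffix (assumed FROGS dereplication naming) from each read name:
```
#!/usr/bin/env python3
"""Hypothetical re-implementation of filter_samfile_by_read_list.py."""
import argparse
import sys


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--read_list", required=True,
                        help="file with one read name per line")
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    with open(args.read_list) as fl:
        wanted = {line.strip() for line in fl if line.strip()}

    kept = 0
    for line in sys.stdin:
        if line.startswith("@"):  # keep SAM header lines if present
            sys.stdout.write(line)
            continue
        # drop an assumed ';size=N' annotation before matching
        read_name = line.split("\t", 1)[0].split(";")[0]
        if read_name in wanted:
            sys.stdout.write(line)
            kept += 1
    if args.verbose:
        print(f"{kept} records kept", file=sys.stderr)


if __name__ == "__main__":
    main()
```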
# Get error rate from shotgun data
In dir shotgun_analysis
```
cp -P /work2/project/seqoccin/metaG/metabarcoding/polymerase_test/fastq/PBR322-proto-shotgun-ligation.fastq.gz .
module load bioinfo/FastQC_v0.11.7
module load bioinfo/MultiQC-v1.7
mkdir -p fastqc
fastqc PBR322-proto-shotgun-ligation.fastq.gz -o fastqc/
multiqc fastqc/
```
map ll
```
@@ -16,12 +16,18 @@ __email__ = 'support.bioinfo.genotoul@inra.fr'
 __status__ = 'dev'

-from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
 import logging
 import sys
-import frogs_analysis_fct
+from frogs_analysis_fct import process_frogs_affiliation, get_corresponding_species_mock, clean_mock_sp_relation
+
+
+def load_mock_taxonomies(mock_taxonomies_file):
+    with open(mock_taxonomies_file) as fl:
+        return {l.rstrip().split("\t")[1]: l.split("\t")[0] for l in fl if l.rstrip()}
+
+
 def parse_arguments():
     """Parse script arguments."""
     parser = ArgumentParser(description="...",
```
```
@@ -41,6 +47,12 @@ def parse_arguments():
     parser.add_argument('--taxonomic_ranks', default='Domain Phylum Class Order Family Genus Species Species', help='Taxonomic ranks of the affiliation')
+    parser.add_argument('--mock_taxonomies', default=None, help='Specify this argument if the sample is from mock communities. '
+                        'It allows clusters to be linked to their corresponding mock species. '
+                        'Provide a tabulated file with two columns: the first column containing the name of the mock species, '
+                        'and the second column containing the taxonomy of this species based on the affiliation database used. '
+                        'The taxonomy should be represented from Phylum to Species, separated by semicolons.')
+
     parser.add_argument('-o', '--output', default='affi_tables_merged.tsv', help='output table name')
     parser.add_argument("-v", "--verbose", help="increase output verbosity",
```
```
@@ -70,9 +82,10 @@ def main():
     region = args.region
     affi_db_name = args.affi_db_name
+    mock_taxonomies_file = args.mock_taxonomies

-    analysis_df = frogs_analysis_fct.process_frogs_affiliation(affi_abundance_fl, multihit_fl, taxonomic_ranks,
+    analysis_df = process_frogs_affiliation(affi_abundance_fl, multihit_fl, taxonomic_ranks,
                                             min_ids = [min_identity], min_covs = [min_coverage])

     df_valid_affi = analysis_df.loc[analysis_df[f'id>{min_identity}_cov>{min_coverage}']]
```
```
@@ -89,6 +102,17 @@ def main():
     analysis_df['region'] = region
     analysis_df['db'] = affi_db_name

+    if mock_taxonomies_file:
+        logging.info(f'Linking clusters to their corresponding mock species using {mock_taxonomies_file}')
+        taxonomies2mock_species = load_mock_taxonomies(mock_taxonomies_file)
+        analysis_df["sp_mock_taxonomy"] = analysis_df[["blast_taxonomy", "mutliaffiliation"]].apply(
+            get_corresponding_species_mock, args=(taxonomies2mock_species,), axis=1)
+        analysis_df["sp_mock_taxonomy"] = analysis_df["sp_mock_taxonomy"].apply(clean_mock_sp_relation)
+        analysis_df["mock_species"] = analysis_df["sp_mock_taxonomy"].apply(
+            lambda x: x if x not in taxonomies2mock_species else taxonomies2mock_species[x])
+
     logging.info(f'Writing merged table in {args.output}')
     analysis_df.to_csv(args.output, sep='\t', index=False)
```
```
@@ -64,6 +64,7 @@ def main():
     tax_table = args.affiliation_table
     samples = args.samples
+    valid_col = 'valid_affiliation'

     samples_concat_str = "-".join(samples)
```
```
@@ -74,6 +75,10 @@ def main():
     df = pd.read_csv(tax_table, sep="\t")
+    assert all((type(v) is bool for v in set(df[valid_col]))), f'Not all values of the {valid_col} column are booleans, so weak affiliations cannot be filtered out for the plot.'
+
+    # keep only rows with a valid affiliation
+    df = df.loc[df[valid_col]]

     # Filter samples
     # remove rows that have no count in the given sample columns
```
```
@@ -142,7 +147,9 @@ def main():
     ]
     colors_iter = cycle(colors)
-    phylums = set(df["Phylum"])
+    taxonomies = set(df["blast_taxonomy_cleaned"])
+    phylums = {taxo.split(';')[1] for taxo in taxonomies}  # index 1 for phylum
     # write itol TREE_COLORS style
     phylum_colors_file = os.path.join(outdir, f"{outdir}_s{samples_concat_str}_colors_styles_phylum.txt")
```
---
title: "DADA2 + PacBio HiFi + FROGS format: ZymoBIOMICS Microbial Community Standard"
output: html_document
---
<!-- #region -->
This Rmarkdown is an adaptation of "DADA2 + PacBio: ZymoBIOMICS Microbial Community Standard" (https://benjjneb.github.io/LRASManuscript/LRASms_Zymo.html) by Benjamin J Callahan.
## Data
The mock community being sequenced is the ZymoBIOMICS Microbial Community Standard: https://www.zymoresearch.com/zymobiomics-community-standard . This mock community contains the following bacterial and yeast species (but we'll ignore the yeast as they aren't amplified by this protocol):
* Pseudomonas aeruginosa
* Escherichia coli
* Salmonella enterica
* Lactobacillus fermentum
* Enterococcus faecalis
* Staphylococcus aureus
* Listeria monocytogenes
* Bacillus subtilis
## Setup
Load libraries, setup paths, prepare environment:
<!-- #endregion -->
```{r init, warning=FALSE, message=FALSE}
library(dada2);packageVersion("dada2")
library(biomformat);packageVersion("biomformat")
#library(Biostrings);packageVersion("Biostrings")
#library(ShortRead);packageVersion("ShortRead")
#library(ggplot2);packageVersion("ggplot2")
#library(reshape2);packageVersion("reshape2")
#library(RColorBrewer);packageVersion("RColorBrewer")
```
Define a function to write the DADA2 result into FROGS-compatible files:
```{r setup}
write_result_to_frogs_format <- function(dada_obj, out_dir, sample_names) {
  seqtab <- makeSequenceTable(dada_obj)
  row.names(seqtab) <- sample_names
  asv_seqs <- colnames(seqtab)
  asv_headers <- paste(">", colnames(seqtab), sep="")
  asv_fasta <- c(rbind(asv_headers, asv_seqs))
  write(asv_fasta, file.path(out_dir, "clustering.fasta"))
  asv_tab <- t(seqtab)
  row.names(asv_tab) <- sub(">", "", asv_headers)
  write.table(asv_tab, file.path(out_dir, "ASVs_counts.txt"), sep="\t", quote=F)
  st.biom <- make_biom(t(seqtab))
  write_biom(st.biom, file.path(out_dir, "clustering.biom"))
}
```
```{r error=TRUE}
multithread_flag=TRUE
path <- "fastq_no_primers"
path.out <- "."
fns <- sort(list.files(path, pattern="fastq.gz", full.names = TRUE))
sample.names <- sapply(strsplit(basename(fns), ".fastq.gz"), `[`, 1)
print(fns)
print(sample.names)
```
## Remove Primers and Filter
Remove primers and orient reads:
This step has already been applied to the data with the cutadapt tool (see the script above).
```{r primers, error=TRUE}
#nops <- file.path(path, "noprimers", basename(fns))
#prims <- removePrimers(fns, nops, primer.fwd=forward_primer, primer.rev=reverse_primer, orient=TRUE, verbose=TRUE)
nops <- file.path(path, basename(fns))
print(nops)
```
More loss than with FROGS... :-/
Original statement: "Very little loss there even though primer indels aren't allowed currently."
Inspect the length distribution:
```{r length_distribution}
lens.fn <- lapply(nops, function(fn) nchar(getSequences(fn)))
lens <- do.call(c, lens.fn)
lens <- as.numeric(lens)
hist(lens, 200)
```
Original statement: "Sharply peaked at the expected length range."
Filter:
```{r filter}
filts <- file.path(path.out, "filtered", basename(nops))
track_filt <- filterAndTrim(nops, filts, minQ=3, minLen=3500, maxLen=5000, maxN=0, rm.phix=FALSE, maxEE=2)
track_filt_df = data.frame(track_filt)
count_pre_filt = track_filt_df$reads.in
count_post_filt = track_filt_df$reads.out
track_filt_df$prct_out = 100 * track_filt_df$reads.out / track_filt_df$reads.in
track_filt_df
```
<!-- #region -->
In the original markdown, 94.9% of the sequences are kept after filtering.
## Run DADA2
Dereplicate:
<!-- #endregion -->
```{r derep}
drp <- derepFastq(filts, verbose=TRUE)
print(drp)
```
Learn errors:
```{r learn-err}
err <- learnErrors(drp, BAND_SIZE=32, multithread=multithread_flag, errorEstimationFunction=dada2:::PacBioErrfun) # 10s of seconds
```
Inspect errors:
```{r plot-err}
plotErrors(err)
```
Apply inflateErr with different inflation values to the error model (in this run, only the original, uninflated model is kept):
```{r}
listErr = c(err[1])
inflate_values = c('None')
length(listErr)
length(inflate_values)
```
Denoise with all the different inflation values and write the results into FROGS-compatible files:
```{r dada}
listDD <- lapply(1:length(listErr), function(x){
  print(x)
  dd <- dada(drp, err=listErr[[x]], BAND_SIZE=32, multithread=multithread_flag)
  cur_val <- inflate_values[x]
  outdirname = paste("err_inflation", cur_val, sep='_')
  dir_out <- file.path(path.out, outdirname)
  print(dir_out)
  dir.create(dir_out, showWarnings = FALSE)
  write_result_to_frogs_format(dd, dir_out, sample.names)
  return(dd)
})
```
```{r read_tracking}
denoised_sum = c()
asv_count = c()
listSummary <- lapply(1:length(listDD), function(inflate_index){
  dd = listDD[[inflate_index]]
  interlign <- lapply(1:length(sample.names), function(sample_index){
    if (length(sample.names) == 1){
      d_sample = dd
    } else {
      d_sample = dd[[sample_index]]
    }
    denoised_sum <- sum(d_sample$denoised)
    asv_count <- nrow(d_sample$clustering)
    cur_ligne <- c(samples = sample.names[sample_index],
                   primers = count_pre_filt[sample_index],
                   filtered = count_post_filt[sample_index],
                   denoised = denoised_sum,
                   asv_count = asv_count,
                   inflate_value = inflate_values[inflate_index])
    return(cur_ligne)
  })
  sub_table <- do.call(rbind, interlign)
})
full_table <- do.call(rbind, listSummary)
full_table
```
```{r write_read_tracking_tsv}
write.table(full_table, file.path(path.out, 'summary_stat.tsv'), sep="\t", quote=F, row.names=F)
```
import pandas as pd
import logging


def clean_mock_sp_relation(mock_sp_relation):
    if len(mock_sp_relation) == 1:
        return mock_sp_relation.pop()
    elif len(mock_sp_relation) == 0:
        return "Other"
    elif len(mock_sp_relation) > 1:
        logging.warning(f"A cluster has more than one mock species in its affiliation:\n {mock_sp_relation}")
        return 'multiple mock species'


def get_corresponding_species_mock(tax_and_multiaffi, mock_taxonomies):
    taxonomy_str = tax_and_multiaffi["blast_taxonomy"]
    mutliaffiliation_str = tax_and_multiaffi["mutliaffiliation"]
    sp_hits = set()
    if taxonomy_str.endswith("Multi-affiliation"):
        # mutliaffiliation_str = mutliaffiliation_str.replace(
        #     alternative_fermentum_taxon, "Lactobacillus")
        for affi in mutliaffiliation_str.split('|'):
            sp_hits |= get_corresponding_species_mock({'blast_taxonomy': affi, 'mutliaffiliation': None}, mock_taxonomies)
    else:
        for mock_tax in mock_taxonomies:
            if taxonomy_str.startswith(mock_tax):
                sp_hits.add(mock_tax)
    return sp_hits
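

# A quick illustration (hypothetical taxonomy and mock table) of how the two
# functions above combine: the cluster taxonomy is matched against the mock
# taxonomies, and the resulting hit set is reduced to a single label.
def _demo_mock_species_matching():
    # as built by load_mock_taxonomies: taxonomy -> mock species name
    mock_taxonomies = {"Bacteria;Firmicutes;Bacilli;Lactobacillales;"
                       "Enterococcaceae;Enterococcus;Enterococcus faecalis": "Enterococcus faecalis"}
    cluster = {"blast_taxonomy": "Bacteria;Firmicutes;Bacilli;Lactobacillales;"
                                 "Enterococcaceae;Enterococcus;Enterococcus faecalis",
               "mutliaffiliation": None}
    hits = get_corresponding_species_mock(cluster, mock_taxonomies)
    # a single hit -> clean_mock_sp_relation returns the matching taxonomy itself
    assert clean_mock_sp_relation(hits) in mock_taxonomies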
def consider_only_selected_samples(df, all_sample, samples_to_keep):
    ...
#!/usr/bin/env python3
"""
Description
:Example:
python template.py -v
"""
# Metadata
__author__ = 'Mainguy Jean - Plateforme bioinformatique Toulouse'
__copyright__ = 'Copyright (C) 2020 INRAE'
__license__ = 'GNU General Public License'
__version__ = '0.1'
__email__ = 'support.bioinfo.genotoul@inra.fr'
__status__ = 'dev'
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
import logging
import sys
import frogs_analysis_fct
import pandas as pd
def parse_arguments():
    """Parse script arguments."""
    parser = ArgumentParser(description="...",
                            formatter_class=ArgumentDefaultsHelpFormatter)

    parser.add_argument('--table_16S23Sdb', required=True, help='Affiliation abundance table with multiaffi info of the 16S23S db affiliation, output of the script add_multiaffi_to_abd_table.py')
    parser.add_argument('--table_16Sdb', required=True, help='Affiliation abundance table with multiaffi info of the 16S db affiliation, output of the script add_multiaffi_to_abd_table.py')
    parser.add_argument('--table_23Sdb', required=True, help='Affiliation abundance table with multiaffi info of the 23S db affiliation, output of the script add_multiaffi_to_abd_table.py')
    parser.add_argument('--min_identity', default=98, type=float, help='Minimal identity to consider an affiliation as valid.')
    parser.add_argument('--min_qcov_23S', default=40, type=float, help='Minimal coverage of a 16S-23S query sequence on a 23S subject sequence to consider the affiliation valid.')
    parser.add_argument('--min_qcov_16S', default=30, type=float, help='Minimal coverage of a 16S-23S query sequence on a 16S subject sequence to consider the affiliation valid.')
    parser.add_argument('--min_qcov_16S23S', default=98, type=float, help='Minimal coverage of a 16S-23S query sequence on a 16S23S subject sequence to consider the affiliation valid.')
    parser.add_argument('-o', '--output', default='affiliation_multiple_db_merged.tsv', help='output table name')
    parser.add_argument("-v", "--verbose", help="increase output verbosity",
                        action="store_true")
    parser.add_argument("--debug", help="greatly increase output verbosity",
                        action="store_true")

    args = parser.parse_args()
    return args


def add_db_to_specific_db_columns(df, db_specific_cols, db_name):
    df = df[db_specific_cols]
    new_col_names = {c: f'{db_name}_{c}' for c in df.columns}
    df = df.rename(columns=new_col_names)
    return df
# Iterate through each cluster and unify the affiliations.
# When the 16S-23S affiliation is weak, the 16S and the 23S affiliations are used.
from itertools import takewhile


def all_equal(items) -> bool:
    '''
    A helper function to test if all items
    in the given iterable are identical.

    Arguments:
        items -> the iterable to be checked

    eg.
    >>> all_equal([1, 1, 1])
    True
    >>> all_equal([1, 1, 2])
    False
    >>> all_equal((1, 1, 1))
    True
    >>> all_equal((1, 1, 2))
    False
    >>> all_equal("111")
    True
    >>> all_equal("112")
    False
    '''
    return all(item == items[0] for item in items)


def common_prefix(its):
    return [items[0] for items in takewhile(all_equal, zip(*its))]
def manage_strain_in_taxo(taxonomy):
    logging.debug(taxonomy)
    assert len(taxonomy) == 7
    if taxonomy[-1].count(' ') > 1 and 'metagenome' not in taxonomy[-1]:
        # the species field looks like a strain name:
        # insert the binomial species before it
        logging.debug('taxonomy species looks like a strain')
        sp = ' '.join(taxonomy[-1].split(' ')[:2])
        taxonomy = taxonomy[:-1] + [sp, taxonomy[-1]]
        logging.debug(taxonomy)
    return taxonomy


def get_common_taxonomy(taxonomies):
    for t in taxonomies:
        logging.debug(t)
    taxonomies_list = (manage_strain_in_taxo(t.split(";")) for t in taxonomies)
    taxonomy = common_prefix(taxonomies_list)
    missing_taxon_count = 7 - len(taxonomy)
    return ';'.join(taxonomy + ['Multi-affiliation'] * missing_taxon_count)
def get_consensus_taxo(affi1, affi2, rank_count):
    for i in range(rank_count, 0, -1):
        affi1_i = {';'.join(t.split(';')[:i]) for t in affi1}
        affi2_i = {';'.join(t.split(';')[:i]) for t in affi2}
        shared_affi = affi1_i & affi2_i
        if len(shared_affi) == 0:
            continue
        missing_taxon_count = rank_count - i
        affi_found_in_both = {';'.join(taxonomy.split(";") + ['Multi-affiliation'] * missing_taxon_count) for taxonomy in shared_affi}
        if len(affi_found_in_both) == 1:
            return affi_found_in_both.pop()
        else:
            return get_common_taxonomy(affi_found_in_both)
    return ';'.join(['Multi-affiliation'] * rank_count)
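

# A quick illustration (hypothetical taxonomies, 7 ranks) of the consensus
# resolution: the two affiliations agree down to the genus, so the species
# rank falls back to 'Multi-affiliation'.
def _demo_get_consensus_taxo():
    affi_16S = {"Bacteria;Firmicutes;Bacilli;Lactobacillales;"
                "Enterococcaceae;Enterococcus;Enterococcus faecalis"}
    affi_23S = {"Bacteria;Firmicutes;Bacilli;Lactobacillales;"
                "Enterococcaceae;Enterococcus;Enterococcus faecium"}
    assert get_consensus_taxo(affi_16S, affi_23S, 7) == (
        "Bacteria;Firmicutes;Bacilli;Lactobacillales;"
        "Enterococcaceae;Enterococcus;Multi-affiliation")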
def get_consensus_affi(affi1, affi2):
    # taxonomic ranks from superkingdom to species
    ranks = ["superkingdom", "phylum", "class", "order",
             "family", "genus", "species"]
    ranks_count = len(ranks)
    affi_origin = affi1['affiliation_origin'] + ' & ' + affi2['affiliation_origin']
    affiliation = {k: None for k in affi1}
    affiliation['affiliation_origin'] = affi_origin
    if affi1['blast_taxonomy'] == affi2['blast_taxonomy']:
        affiliation = {k: v for k, v in affi1.items()}
    taxonomies = set()
    if pd.isna(affi1['mutliaffiliation']):
        taxonomies.add(affi1['blast_taxonomy'])
        taxonomies1 = {affi1['blast_taxonomy']}
    else:
        taxonomies |= set(affi1['mutliaffiliation'].split('|'))
        taxonomies1 = set(affi1['mutliaffiliation'].split('|'))
    if pd.isna(affi2['mutliaffiliation']):
        taxonomies.add(affi2['blast_taxonomy'])
        taxonomies2 = {affi2['blast_taxonomy']}
    else:
        taxonomies |= set(affi2['mutliaffiliation'].split('|'))
        taxonomies2 = set(affi2['mutliaffiliation'].split('|'))

    taxonomy = get_consensus_taxo(taxonomies1, taxonomies2, ranks_count)
    # taxonomy = get_common_taxonomy(taxonomies)
    logging.debug('16S affi')
    [logging.debug(t) for t in taxonomies1]
    # logging.debug(affi2)
    logging.debug('23S affi')
    [logging.debug(t) for t in taxonomies2]
    logging.debug("CONSENSUS")
    logging.debug(taxonomy)

    affiliation['blast_taxonomy'] = taxonomy
    affiliation['mutliaffiliation'] = '|'.join(taxonomies)
    affiliation['blast_perc_identity'] = min((affi2['blast_perc_identity'], affi1['blast_perc_identity']))
    affiliation['blast_perc_query_coverage'] = min((affi2['blast_perc_query_coverage'], affi1['blast_perc_query_coverage']))
    affiliation['taxon_affi'] = frogs_analysis_fct.get_taxon_affi(taxonomy, ranks)
    affiliation['rank_affi'] = frogs_analysis_fct.get_rank_affi(taxonomy, ranks)
    affiliation['affiliation_origin'] = affi_origin
    return affiliation
def get_affiliation_from_db(row, db_name):
    db_row = {k.replace(db_name + '_', ''): v for k, v in row.items() if k.startswith(db_name)}
    db_row['affiliation_origin'] = db_name
    return db_row


def unify_affiliation(row, min_id=98, min_cov_16S23S=99, min_cov_16S=30, min_cov_23S=40):
    is_valid = True
    is_16S23S_valid = row["16S23Sdb_blast_perc_identity"] >= min_id and row["16S23Sdb_blast_perc_query_coverage"] >= min_cov_16S23S
    is_16Ssilva_valid = row["16Sdb_blast_perc_identity"] >= min_id and row["16Sdb_blast_perc_query_coverage"] >= min_cov_16S
    is_23Ssilva_valid = row["23Sdb_blast_perc_identity"] >= min_id and row["23Sdb_blast_perc_query_coverage"] >= min_cov_23S

    logging.debug(f'is_16S23S_valid {is_16S23S_valid}')
    logging.debug(f'is_16Ssilva_valid {is_16Ssilva_valid}')
    logging.debug(f'is_23Ssilva_valid {is_23Ssilva_valid}')

    if is_16S23S_valid:
        affiliation = get_affiliation_from_db(row, '16S23Sdb')
    elif is_16Ssilva_valid and is_23Ssilva_valid:
        affiliation16S = get_affiliation_from_db(row, '16Sdb')
        affiliation23S = get_affiliation_from_db(row, '23Sdb')
        affiliation = get_consensus_affi(affiliation16S, affiliation23S)
    elif is_16Ssilva_valid and not is_23Ssilva_valid:
        affiliation = get_affiliation_from_db(row, '16Sdb')
    elif is_23Ssilva_valid and not is_16Ssilva_valid:
        affiliation = get_affiliation_from_db(row, '23Sdb')
    else:
        # the affiliation is weak everywhere; fall back to the 16S-23S one
        affiliation = get_affiliation_from_db(row, '16S23Sdb')
        affiliation['affiliation_origin'] = "weak affiliation"
        is_valid = False
    affiliation['is_valid'] = is_valid
    return affiliation
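

# A quick illustration (hypothetical row with only the fields the validity
# checks and the 16S-23S branch actually read) of the decision logic above:
# a strong 16S-23S hit wins outright, so no consensus step is needed.
def _demo_unify_affiliation():
    row = {
        "16S23Sdb_blast_taxonomy": "Bacteria;Proteobacteria;Gammaproteobacteria;"
                                   "Enterobacterales;Enterobacteriaceae;Escherichia;Escherichia coli",
        "16S23Sdb_blast_perc_identity": 99.5,
        "16S23Sdb_blast_perc_query_coverage": 99.8,
        "16Sdb_blast_perc_identity": 99.0,
        "16Sdb_blast_perc_query_coverage": 35.0,
        "23Sdb_blast_perc_identity": 99.0,
        "23Sdb_blast_perc_query_coverage": 45.0,
    }
    affi = unify_affiliation(row, min_id=98, min_cov_16S23S=99, min_cov_16S=30, min_cov_23S=40)
    assert affi["affiliation_origin"] == "16S23Sdb"
    assert affi["is_valid"]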
def main():
    args = parse_arguments()

    if args.debug:
        logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.DEBUG)
        logging.debug('Mode debug ON')
    elif args.verbose:
        logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
        logging.info('Mode verbose ON')
    else:
        logging.basicConfig(format="%(levelname)s: %(message)s")

    # specific columns that change for each db used for affiliation
    db_specific_cols = ["blast_taxonomy",
                        'blast_evalue',
                        "blast_subject",
                        "blast_perc_identity",
                        "blast_perc_query_coverage",
                        "blast_aln_length",
                        "taxon_affi",
                        "rank_affi",
                        "mutliaffiliation"]

    affi_16S23Sdb_table = args.table_16S23Sdb  # "./abundance_and_multiaffi_16S23S_custom.tsv"
    affi_16Sdb_table = args.table_16Sdb  # "./abundance_and_multiaffi_16Ssilva.tsv"
    affi_23Sdb_table = args.table_23Sdb  # "./abundance_and_multiaffi_23Ssilva.tsv"

    min_id = args.min_identity  # 98
    min_cov_16S23S = args.min_qcov_16S23S  # 99
    min_cov_16S = args.min_qcov_16S  # 30
    min_cov_23S = args.min_qcov_23S  # 40

    logging.info("Parse affiliation tables")
    df_affi_16S23Sdb = pd.read_csv(affi_16S23Sdb_table, sep='\t').set_index('seed_id', drop=False)
    df_affi_23Sdb = pd.read_csv(affi_23Sdb_table, sep='\t').set_index('seed_id', drop=False)
    df_affi_16Sdb = pd.read_csv(affi_16Sdb_table, sep='\t').set_index('seed_id', drop=False)

    # Create a table with the shared columns, i.e. columns that have the same value for all dbs
    common_cols = set(df_affi_16Sdb.columns) - set(db_specific_cols)
    df_common = df_affi_16Sdb[[c for c in df_affi_16Sdb.columns if c in common_cols]]
    df_common = df_common.set_index('seed_id', drop=False)

    # add the db name to the db-specific columns
    df_affi_16Sdb = add_db_to_specific_db_columns(df_affi_16Sdb, db_specific_cols, "16Sdb")
    df_affi_23Sdb = add_db_to_specific_db_columns(df_affi_23Sdb, db_specific_cols, "23Sdb")
    df_affi_16S23Sdb = add_db_to_specific_db_columns(df_affi_16S23Sdb, db_specific_cols, "16S23Sdb")

    # merge affiliation tables
    logging.info("concat affiliation tables")
    df = pd.concat([df_common,
                    df_affi_16Sdb,
                    df_affi_23Sdb,
                    df_affi_16S23Sdb], axis=1)

    # Some clusters may have been removed from the 16S-23S db table. Check that and log a warning if so
    df_rm_suspicious = df.loc[df['16S23Sdb_blast_taxonomy'].isna()]
    if len(df_rm_suspicious):
        logging.warning(f"{len(df_rm_suspicious)} clusters are not found in the 16S-23S affiliation table. These clusters have probably been removed by affiliation filters because they had a suspicious affiliation in the 16S-23S db.")
        sum_seq_in_rm_clusters = df_rm_suspicious['observation_sum'].sum()
        abd_of_rm_clusters = 100 * df_rm_suspicious['observation_sum'].sum() / df['observation_sum'].sum()
        logging.warning(f"They represent a total of {sum_seq_in_rm_clusters} sequences and a relative abundance of {abd_of_rm_clusters:.3f}%")
        logging.warning('These clusters are ignored.')

    # remove clusters that do not appear in the 16S-23S table because they were removed by affiliation filtering due to a suspicious affiliation
    df = df.loc[~df['16S23Sdb_blast_taxonomy'].isna()]

    logging.info("unify/merge the affiliations")
    rows = []
    for _, row in df.iterrows():
        logging.debug('=')
        unified_affi = unify_affiliation(row, min_id, min_cov_16S23S, min_cov_16S, min_cov_23S)
        otu_debug = {}
        otu_debug.update(dict(row))
        otu_debug.update(dict(unified_affi))
        # clean taxonomy --> remove uninformative taxa: taxa with "unknown" or "metagenome" in their name
        otu_debug['blast_taxonomy_cleaned'] = frogs_analysis_fct.clean_taxonomy(unified_affi['blast_taxonomy'])
        rows.append(otu_debug)

    df_merged = pd.DataFrame(rows)
    df_merged['abundance'] = 100 * df_merged['observation_sum'] / df_merged['observation_sum'].sum()
    logging.info(f'Writing merged table in {args.output}')
    df_merged.to_csv(args.output, sep="\t", index=False)


if __name__ == '__main__':
    main()
```
@@ -36,7 +36,7 @@ def parse_arguments():
     parser.add_argument('--min_qcov_23S', default=40, type=float, help='Minimal coverage of a 23S16S query sequence on 23S subject sequence to consider the affilition valid.')
     parser.add_argument('--min_qcov_16S', default=30, type=float, help='Minimal coverage of a 23S16S query sequence on 16S subject sequence to consider the affilition valid.')
-    parser.add_argument('--min_qcov_16S23S', default=98, type=float, help='Minimal coverage of a 23S16S query sequence on 16S23S subject sequence to consider the affilition valid.')
+    parser.add_argument('--min_qcov_16S23S', default=98, type=float, help='Minimal coverage of a 16S-23S query sequence on 16S23S subject sequence to consider the affilition valid.')
     parser.add_argument('-o', '--output', default='affiliation_multiple_db_merged.tsv', help='output table name')
```
```
@@ -131,7 +131,6 @@ def get_consensus_taxo(affi1, affi2, rank_count):
-def get_consensus_affi(affi1, affi2):
+def get_consensus_affi(affi1, affi2, ranks):
     ranks_count = len(ranks)
     affi_origin = affi1['affiliation_origin'] + ' & ' + affi2['affiliation_origin']
```
```
@@ -227,7 +226,7 @@ def unify_affiliation(row, ranks, min_id = 98, min_cov_16S23S=99,min_cov_16S = 3
         is_valid = False
-    affiliation['is_valid'] = is_valid
+    affiliation['valid_affiliation'] = is_valid
     return affiliation
```
```
@@ -250,6 +249,7 @@ def main():
     # specific columns that change for each db used for affiliation.
     db_specific_cols = ["blast_taxonomy",
+                        "valid_affiliation",
                         'blast_evalue',
                         "blast_subject",
                         "blast_perc_identity",
```
```
@@ -313,9 +313,14 @@ def main():
     for _, row in df.iterrows():
         logging.debug('=')
-        unified_affi = unify_affiliation(row, min_id, min_cov_16S23S, min_cov_16S, min_cov_23S)
+        unified_affi = unify_affiliation(row, ranks, min_id, min_cov_16S23S, min_cov_16S, min_cov_23S)
+
         otu_debug = {}
         otu_debug.update(dict(row))
         otu_debug.update(dict(unified_affi))
         # clean taxonomy --> remove uninformative taxa : taxon with "unknown" or "metagenome" in their name
-        otu_debug['blast_taxonomy_cleaned'] = frogs_analysis_fct.clean_taxonomy(unified_affi['blast_taxonomy'])
+        otu_debug['blast_taxonomy_cleaned'] = frogs_analysis_fct.clean_taxonomy(unified_affi['blast_taxonomy'], ranks)
         rows.append(otu_debug)
```
```
@@ -70,26 +70,58 @@ def main():
         logging.basicConfig(format="%(levelname)s: %(message)s")

-    rank2color = {'superkingdom': 'rgb(95, 70, 144)',
-                  "Domain": 'rgb(95, 70, 144)',
-                  'phylum': 'rgb(29, 105, 150)',
-                  'class': 'rgb(56, 166, 165)',
-                  'order': 'rgb(15, 133, 84)',
-                  'family': 'rgb(115, 175, 72)',
-                  'genus': 'rgb(237, 173, 8)',
-                  'species': 'rgb(204, 80, 62)',
-                  'Superkingdom': 'rgb(95, 70, 144)',
-                  'Phylum': 'rgb(29, 105, 150)',
-                  'Class': 'rgb(56, 166, 165)',
-                  'Order': 'rgb(15, 133, 84)',
-                  'Family': 'rgb(115, 175, 72)',
-                  'Genus': 'rgb(237, 173, 8)',
-                  'Species': 'rgb(204, 80, 62)',
-                  "weak affiliation": "grey"}
+    # rank2color = {'superkingdom': 'rgb(95, 70, 144)',
+    #               "Domain": 'rgb(95, 70, 144)',
+    #               'phylum': 'rgb(29, 105, 150)',
+    #               'class': 'rgb(56, 166, 165)',
+    #               'order': 'rgb(15, 133, 84)',
+    #               'family': 'rgb(115, 175, 72)',
+    #               'genus': 'rgb(237, 173, 8)',
+    #               'species': 'rgb(204, 80, 62)',
+    #               'Superkingdom': 'rgb(95, 70, 144)',
+    #               'Phylum': 'rgb(29, 105, 150)',
+    #               'Class': 'rgb(56, 166, 165)',
+    #               'Order': 'rgb(15, 133, 84)',
+    #               'Family': 'rgb(115, 175, 72)',
+    #               'Genus': 'rgb(237, 173, 8)',
+    #               'Species': 'rgb(204, 80, 62)',
+    #               "weak affiliation": "grey"}
+    col2code = {"grey": "#999999",
+                "orange": "#E69F00",
+                "ligth_bleu": "#56B4E9",
+                "green": "#009E73",
+                "yellow": "#F0E442",
+                "darker_blue": "#0072B2",
+                "darker_orange": "#D55E00",
+                "pink": "#CC79A7"}
+    rank2color = {"Domain": col2code['pink'],
+                  'Phylum': col2code['yellow'],
+                  'Class': col2code['ligth_bleu'],
+                  'Order': col2code['darker_blue'],
+                  'Family': col2code['green'],
+                  'Genus': col2code['orange'],
+                  'Species': col2code['darker_orange'],
+                  "weak affiliation": col2code['grey']}
     ranks = ["Domain", "Phylum", "Class", "Order",
              "Family", "Genus", "Species", "Species"]
+    # color 1-main 000000 0 0 0 black, grey, grey, grey, rich black, grey, cod grey, grey, almost black, grey
+    # color 2-main 2271B2 34 113 178 honolulu blue, bluish, strong cornflower blue, spanish blue, medium persian blue, sapphire blue, ocean boat blue, french blue, windows blue, tufts blue
+    # color 2-alt AA0DB4 170 13 180 barney, strong magenta, heliotrope magenta, strong heliotrope, steel pink, barney purple, purple, violet, violet eggplant, deep magenta
+    # color 3-main 3DB7E9 61 183 233 summer sky, cyan, picton blue, vivid cerulean, deep sky blue, brilliant cornflower blue, malibu, bright cerulean, cerulean, cerulean
+    # color 3-alt FF54ED 255 84 237 light magenta, violet pink, light brilliant magenta, pink flamingo, light brilliant orchid, brilliant magenta, purple pizzazz, candy pink, blush pink, shocking pink
+    # color 4-main F748A5 247 72 165 barbie pink, rose bonbon, wild strawberry, brilliant rose, brilliant rose, magenta, wild strawberry, light brilliant rose, frostbite, brilliant cerise
+    # color 4-alt 00B19F 0 177 159 strong opal, tealish, persian green, keppel, topaz, manganese blue, light sea green, sea green light, puerto rico, turquoise
+    # color 5-main 359B73 53 155 115 ocean green, sea green, viridian, mother earth, moderate spring green, moderate aquamarine, paolo veronese green, observatory, jungle green, ocean green
+    # color 5-alt EB057A 235 5 122 vivid rose, red purple, mexican pink, bright pink, rose, strong pink, luminous vivid rose, deep pink, winter sky, hot pink
+    # color 6-main d55e00 213 94 0 bamboo, smoke tree, red stage, tawny, tenn, tenne, burnt orange, rusty orange, dark orange, mars yellow
+    # color 6-alt F8071D 248 7 29 vivid red, luminous vivid amaranth, ruddy, ku crimson, vivid amaranth, light brilliant red, cherry red, red, red, bright red
+    # color 7-main e69f00 230 159 0 gamboge, squash, buttercup, marigold, dark goldenrod, medium goldenrod, fuel yellow, sun, harvest gold, orange
+    # color 7-alt FF8D1A 255 141 26 dark orange, juicy, west side, tangerine, gold drop, pizazz, princeton orange, university of tennessee orange, tangerine, tahiti gold
+    # color 8-main f0e442 240 228 66 holiday, buzz, paris daisy, starship, golden fizz, dandelion, gorse, lemon yellow, bright lights, sunflower
+    # color 8-alt 9EFF37 158 255 55 french lime, lime, green yellow, green lizard, luminous vivid spring bud, spring frost, vivid spring bud, bright yellow green, spring bud, acid green

     affi_tables = args.affi_tables
```
```
@@ -144,10 +176,14 @@ def main():
     df_rank['abundance_all_sample_round'] = df_rank['abundance_all_sample_round'] + '<br>' + df_rank['observation_name'].astype(str) + ' clusters'

     fig = px.bar(df_rank, x="name", y="abundance", color="Taxonomic rank",  # facet_col="name",
+                 template="seaborn",
                  text="abundance_all_sample_round",
                  category_orders={"rank_affi": ranks, "name": labels,  # "region":regions_analysed,
                                   'Taxonomic rank': ranks + ["ident or cov < 99%"]}, color_discrete_map=rank2color, )
+    fig.update_layout(  # customize font and legend orientation & position
+        legend={'traceorder': 'reversed'}
+    )
     # fig.update_layout(
     #     width=700,
```
```
@@ -199,8 +235,12 @@ def main():
     fig = px.bar(df_rank, x="sample", y="abundance", color="Taxonomic rank", facet_row="n",
                  category_orders={"rank_affi": ranks, "n": labels,  # "dataset":dataset_analysed, "region":regions_analysed,
                                   'Taxonomic rank': ranks + ["ident or cov < 99%"]},
+                 template="seaborn",
                  color_discrete_map=rank2color, )
+    fig.update_layout(  # customize font and legend orientation & position
+        legend={'traceorder': 'reversed'}
+    )
     # fig.update_layout(
     #     width=700,
     #     height=600,)
```