---
# Filesystem locations for this collection.
directories:
    # Absolute path of the working directory for the Glycine genus-level data.
    work_dir: /usr/local/www/data/private/Glycine/GENUS
    # Name of the directory holding the input files; presumably the pandagma
    # output directory, resolved relative to work_dir — TODO confirm.
    from_pan_dir: out_pandagma
# Filename prefixes for the input files.
prefixes:
    # Intentionally left without a value: a bare key loads as null, not as an
    # empty string. Presumably means "no prefix" — verify against the consumer.
    from_pan_prefix: 
# Identifying fields for this pan-gene collection.
collection_info:
    # Genus name; matches the genus component of work_dir.
    genus: Glycine
    # Version label of the pan-gene set.
    pan_ver: pan5
    # Four-letter key for the collection; presumably the unique key used in
    # output file and directory names — TODO confirm.
    pan_key: MKRS
# Metadata fields describing the collection (provenance, contents, dates,
# attribution). Keys without a value load as null.
readme_info:
    provenance: "The files in this directory are a product of the staff of the SoyBase and LegumeInfo project teams. The method is described here: https://github.com/legumeinfo/pandagma"
    source: "https://data.legumeinfo.org"
    # NOTE(review): synopsis and description both state 56 annotation sets, but
    # annotations_main (21 entries) plus annotations_extra (36 entries) appears
    # to total 57 — confirm which figure is correct.
    synopsis: "Pangene set for Glycine species, based on G. max and G. soja but also including 6 perennial Australian species by homology. This pangene set includes 56 Glycine annotation sets."
    # Quoted so the all-digit identifier stays a string.
    taxid: "3846"
    # Each annotations_* value is a single comma-separated plain scalar (one
    # string), not a YAML sequence.
    annotations_main: glyma.FiskebyIII.gnm1.ann1, glyma.Hefeng25_IGA1002.gnm1.ann1, glyma.Huaxia3_IGA1007.gnm1.ann1, glyma.JD17.gnm1.ann1, glyma.Jinyuan_IGA1006.gnm1.ann1, glyma.Lee.gnm1.ann1, glyma.Lee.gnm2.ann1, glyma.Lee.gnm3.ann1, glyma.Wenfeng7_IGA1001.gnm1.ann1, glyma.Wm82.gnm2.ann1, glyma.Wm82.gnm4.ann1, glyma.Wm82.gnm5.ann1, glyma.Wm82.gnm6.ann1, glyma.Wm82_IGA1008.gnm1.ann1, glyma.Wm82_NJAU.gnm1.ann1, glyma.Zh13_IGA1005.gnm1.ann1, glyma.Zh13.gnm1.ann1, glyma.Zh35_IGA1004.gnm1.ann1, glyso.F_IGA1003.gnm1.ann1, glyso.PI483463.gnm1.ann1, glyso.W05.gnm1.ann1
    annotations_extra: glyma.58-161.gnm1.ann1, glyma.Amsoy.gnm1.ann1, glyma.DongNongNo_50.gnm1.ann1, glyma.FengDiHuang.gnm1.ann1, glyma.HanDouNo_5.gnm1.ann1, glyma.HeiHeNo_43.gnm1.ann1, glyma.Hwangkeum.gnm1.ann1, glyma.JiDouNo_17.gnm1.ann1, glyma.JinDouNo_23.gnm1.ann1, glyma.JuXuanNo_23.gnm1.ann1, glyma.KeShanNo_1.gnm1.ann1, glyma.PI_398296.gnm1.ann1, glyma.PI_548362.gnm1.ann1, glyma.QiHuangNo_34.gnm1.ann1, glyma.ShiShengChangYe.gnm1.ann1, glyma.TieFengNo_18.gnm1.ann1, glyma.TieJiaSiLiHuang.gnm1.ann1, glyma.TongShanTianEDan.gnm1.ann1, glyma.WanDouNo_28.gnm1.ann1, glyma.Wm82.gnm1.ann1, glyma.Wm82_ISU01.gnm2.ann1, glyma.XuDouNo_1.gnm1.ann1, glyma.YuDouNo_22.gnm1.ann1, glyma.Zh13.gnm2.ann1, glyma.ZhangChunManCangJin.gnm1.ann1, glyma.Zhutwinning2.gnm1.ann1, glyma.ZiHuaNo_4.gnm1.ann1, glyso.PI_549046.gnm1.ann1, glyso.PI_562565.gnm1.ann1, glyso.PI_578357.gnm1.ann1, glycy.G1267.gnm1.ann1, glyd3.G1403.gnm1.ann1, glydo.G1134.gnm1.ann1, glyfa.G1718.gnm1.ann1, glyst.G1974.gnm1.ann1, glysy.G1300.gnm1.ann1
    description: "Pan-gene set for Glycine species, spanning 8 species and 56 annotation sets, calculated using the pandagma pipeline, version 2.0 (February, 2024)"
    # The accession/DOI fields below have no value and therefore load as null.
    bioproject: 
    sraproject: 
    dataset_doi_genome: 
    dataset_doi_annot: 
    genbank_accession: 
    # Unquoted ISO dates: loaders that apply YAML 1.1 implicit typing may turn
    # these into date objects rather than strings.
    original_file_creation_date: 2024-02-26
    local_file_creation_date: 2024-02-26
    dataset_release_date: 2024-02-26
    contributors: Steven Cannon
    # No publication is recorded: the three publication fields load as null.
    publication_doi: 
    citation: 
    publication_title: 
    data_curators: Steven Cannon
    public_access_level: public
    license: Open
    keywords: "soybean, Glycine, pan-gene, pangene, orthogroup"
# Tab-separated pan-gene tables. Each entry names an input file (`from`),
# the name it maps to (`to`), and a human-readable description of it.
from_to_pan_tsv:
  # Cluster format: one pan-gene set per row.
  - from: 18_syn_pan_aug_extra.clust.tsv
    to: clust.tsv
    description: "Pan-gene sets, in cluster format: ID in first column, followed by tab-separated gene list."
  # Per-annotation-set gene counts.
  - from: 18_syn_pan_aug_extra.counts.tsv
    to: counts.tsv
    description: "Matrix of counts of genes per annotation set for each pan-gene set."
  # Two-column (set ID, gene) format.
  - from: 18_syn_pan_aug_extra.hsh.tsv
    to: hsh.tsv
    description: "Pan-gene sets, in a two-column hash format, with the set ID in the first column and genes in the second."
  # Genes laid out in columns by accession.
  - from: 18_syn_pan_aug_extra.table.tsv
    to: table.tsv
    description: "Table of genes in each family or pangene, in columns by accession"
# FASTA sequence files. Each entry names an input file (`from`), the name it
# maps to (`to`), and a description. Two entries also carry a `strip` regex.
from_to_pan_fasta:
  # Inclusive (unfiltered) representative sequences, CDS then protein.
  - from: 21_pan_fasta_clust_rep_cds.fna
    to: inclusive_cds.fna
    description: "CDS pan-gene sequence, inclusive (not filtered by minimum cluster size or annotation-set representation)."
  - from: 21_pan_fasta_clust_rep_prot.faa
    to: inclusive_protein.faa
    description: "Protein pan-gene sequence, inclusive (not filtered by minimum cluster size or annotation-set representation)."
  # Size-filtered, position-named sequences, CDS then protein.
  # `strip` matches word characters, then ".pan" plus digits, then a dot;
  # presumably this prefix is removed from sequence IDs — TODO confirm.
  - from: 23_syn_pan_pctl25_posn_cds.fna
    to: pctl25_named_cds.fna
    strip: '\w+\.pan\d+\.'
    description: "CDS pan-gene sequence, omitting pan-genes smaller than 25% of the mode, with derived pan-gene IDs corresponding with consensus chromosome and ordinal position."
  - from: 23_syn_pan_pctl25_posn_prot.faa
    to: pctl25_named_protein.faa
    strip: '\w+\.pan\d+\.'
    description: "Protein pan-gene sequence, omitting pan-genes smaller than 25% of the mode, with derived pan-gene IDs corresponding with consensus chromosome and ordinal position."
# Remaining files, listed with the same from/to/description fields as the
# other from_to_* sections; presumably copied without modification, as the
# section name suggests — TODO confirm.
from_to_pan_as_is:
  # Sequences of genes left outside every pan-gene cluster.
  - from: 18_syn_pan_aug_extra_complement.fna
    to: complement.fna
    description: "Complement of genes in this pan-gene set; i.e. not clustered, presumed to be singletons."
  # Run statistics; source and target names are identical.
  - from: stats.txt
    to: stats.txt
    description: "Descriptive statistics about program parameters, input sequences, and pan-gene products."