---
# Directory settings for this collection.
# NOTE(review): presumably work_dir is the location where the collection is
# assembled and from_pan_dir is the pan-gene output directory that files are
# read from — confirm against the consuming script.
directories:
    work_dir: '/usr/local/www/data/private/Arachis/GENUS'
    from_pan_dir: 'out_Arachis_5_1'
# Filename prefix settings.
# from_pan_prefix is intentionally left without a value here; a standard YAML
# loader reads a bare empty value as null rather than as an empty string.
# NOTE(review): presumably this means the source files carry no prefix —
# confirm how the consuming script treats a null/empty prefix.
prefixes:
    from_pan_prefix: 
# Identifiers for this pan-gene collection: genus name, pan-gene set version,
# and a four-character key.
# Values are quoted so that ID-like strings are always loaded as strings.
collection_info:
    genus: 'Arachis'
    pan_ver: 'pan1'
    pan_key: '4LN9'
# Descriptive metadata for the collection (provenance, synopsis, taxon ID,
# constituent annotation sets, dates, contributors, license, keywords).
# NOTE(review): presumably these fields are written into the collection's
# README — confirm against the consuming script.
#
# Keys with no value (bioproject, sraproject, DOIs, citation, ...) are loaded
# as null by standard YAML loaders, not as empty strings.
readme_info:
    provenance: "The files in this directory are a product of the staff of the LegumeInfo, PeanutBase, and SoyBase project teams. The method is described here: https://github.com/legumeinfo/pandagma"
    source: "https://data.legumeinfo.org"
    synopsis: "Pangene set for Arachis species, based on A. hypogaea, A. duranensis, and A. ipaensis. This pangene set includes five annotation sets from A. hypogaea and one each from A. duranensis and A. ipaensis."
    taxid: "3826"
    # Five A. hypogaea annotation sets (comma-separated, no spaces).
    annotations_main: arahy.BaileyII.gnm1.ann1,arahy.Tifrunner.gnm1.ann1,arahy.Tifrunner.gnm1.ann2,arahy.Tifrunner.gnm2.ann1,arahy.Tifrunner.gnm2.ann2
    # Two additional annotation sets: A. duranensis and A. ipaensis.
    annotations_extra: aradu.V14167.gnm1.ann1,araip.K30076.gnm1.ann1
    # Annotation-set count is 7 = 5 (annotations_main) + 2 (annotations_extra),
    # matching the synopsis above.
    description: "Pan-gene set for Arachis species, spanning 3 species and 7 annotation sets, calculated using the pandagma pipeline, version 2023-04-03"
    bioproject:
    sraproject:
    dataset_doi_genome:
    dataset_doi_annot:
    genbank_accession:
    # NOTE(review): these dates are unquoted; YAML 1.1 loaders (e.g. PyYAML)
    # resolve them to date objects rather than strings. Quote them if the
    # consumer expects plain strings.
    original_file_creation_date: 2023-04-03
    local_file_creation_date: 2023-04-03
    dataset_release_date: 2023-04-03
    contributors: The International Peanut Genome Initiative and authors of all constituent genomes and annotations
    publication_doi:
    citation:
    publication_title:
    data_curators: Steven Cannon, Andrew Farmer
    public_access_level: public
    license: Open
    keywords: "Arachis, peanut, pan-gene, pangene, orthogroup"
# Tabular (TSV) pan-gene files. Each entry gives a source file name (`from`),
# a target name (`to`), and a human-readable description.
# NOTE(review): presumably the consumer copies/renames `from` to `to` —
# confirm against the consuming script.
from_to_pan_tsv:
  - from: 18_syn_pan_aug_extra.clust.tsv
    to: clust.tsv
    description: "Pan-gene sets, in cluster format: ID in first column, followed by tab-separated gene list."
  - from: 18_syn_pan_aug_extra.counts.tsv
    to: counts.tsv
    description: "Matrix of counts of genes per annotation set for each pan-gene set."
  - from: 18_syn_pan_aug_extra.hsh.tsv
    to: hsh.tsv
    description: "Pan-gene sets, in a two-column hash format, with the set ID in the first column and genes in the second."
# FASTA pan-gene sequence files. Each entry gives a source file name (`from`),
# a target name (`to`), and a description; two entries also carry a `strip`
# regular expression.
# NOTE(review): presumably `strip` is a pattern removed from sequence IDs when
# the file is written — confirm against the consuming script. It is
# single-quoted so the backslashes stay literal.
from_to_pan_fasta:
  - from: 21_pan_fasta_clust_rep_cds.fna
    to: inclusive_cds.fna
    description: "CDS pan-gene sequence, inclusive (not filtered by minimum cluster size or annotation-set representation)."
  - from: 21_pan_fasta_clust_rep_prot.faa
    to: inclusive_protein.faa
    description: "Protein pan-gene sequence, inclusive (not filtered by minimum cluster size or annotation-set representation)."
  - from: 23_syn_pan_pctl25_posn_cds.fna
    to: pctl25_named_cds.fna
    strip: '\w+\.pan\d+\.'
    description: "CDS pan-gene sequence, omitting pan-genes smaller than 25% of the mode, with derived pan-gene IDs corresponding with consensus chromosome and ordinal position."
  - from: 23_syn_pan_pctl25_posn_prot.faa
    to: pctl25_named_protein.faa
    strip: '\w+\.pan\d+\.'
    description: "Protein pan-gene sequence, omitting pan-genes smaller than 25% of the mode, with derived pan-gene IDs corresponding with consensus chromosome and ordinal position."
# Remaining files, listed with a source name (`from`), a target name (`to`),
# and a description.
# NOTE(review): the section name suggests these are transferred without
# content changes — confirm against the consuming script.
from_to_pan_as_is:
  - from: 18_syn_pan_aug_extra_complement.fna
    to: complement.fna
    description: "Complement of genes in this pan-gene set; i.e. not clustered, presumed to be singletons."
  - from: stats.Arachis_5_1.txt
    to: stats.txt
    description: "Descriptive statistics about program parameters, input sequences, and pan-gene products."