# Preparation notes for the Lupinus albus (white lupin) "Amiga" genome assembly
# and annotation: download the public files, build a combined
# nuclear + chloroplast + mitochondrion genome and GFF3, and rewrite IDs to the
# "lupal.Amiga.gnm1..." naming convention.
#
# Source files are from https://www.whitelupin.fr/download.html
# Requires bash (process substitution is used when building combined.gff3) and
# GNU sed (\t in patterns).

# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------
wget https://www.whitelupin.fr/genome_files/Lalbus-20171117r1.genome.fasta
wget https://www.whitelupin.fr/genome_files/mitochondrion/Lalb_mitochondrion.zip
wget https://www.whitelupin.fr/genome_files/chloroplast/Lalb_chloroplast.zip
wget https://www.whitelupin.fr/genome_files/Lalbus-20171117r1-v1.annot.gff3
wget https://www.whitelupin.fr/genome_files/ShortStack_Clusters_All.gff3
wget https://www.whitelupin.fr/genome_files/4-DANTE-tracks.gff
wget https://www.whitelupin.fr/genome_files/Lalbus-satDNA-annotation-final.gff
wget https://www.whitelupin.fr/genome_files/Lalbus-rDNA-annotation-final.gff
wget https://www.whitelupin.fr/genome_files/gene.fasta
wget https://www.whitelupin.fr/genome_files/prot.fasta
wget https://www.whitelupin.fr/genome_files/cds.fasta
wget https://www.whitelupin.fr/genome_files/mrna.fasta
wget https://www.whitelupin.fr/genome_files/ncrna.fasta

# NOTE(review): the steps below read Lalb_chloroplast.fasta / Lalb_chloroplast.gff
# and Lalb_mitochondrion.fasta / Lalb_mitochondrion.gff3, none of which are
# downloaded directly. Presumably they come from unpacking the two .zip archives
# above; that unpack step is not recorded in these notes -- confirm and add it.

# ---------------------------------------------------------------------------
# Genome: nuclear + organelles in one FASTA, then prefix and index
# ---------------------------------------------------------------------------
cat Lalbus-20171117r1.genome.fasta Lalb_chloroplast.fasta Lalb_mitochondrion.fasta > genome.fasta
normalizer prefix --target genome.fasta --genus Lupinus --species albus --gnm 1 --infra_id Amiga --unique_key F4NR
normalizer index --target Lupinus_albus/Amiga.gnm1.F4NR/lupal.Amiga.gnm1.F4NR.genome_main.fna

# ---------------------------------------------------------------------------
# Chloroplast GFF
# ---------------------------------------------------------------------------
# Mostly this addresses various complaints of the chado GFF loader, and makes a
# best effort to get records linked to appropriate parents.
#
# FIXME(review): this command is CORRUPTED in this copy of the notes. The text
# between `s/\` and `) {` is missing -- the end of the sed script and the start
# of what looks like an embedded Perl filter were lost (possibly everything
# from a `<` up to the `>` of a `while (<>)` was stripped as markup). It is
# kept verbatim so nothing further is lost, but it will not run as intended
# until the original is recovered. The surviving Perl fragment appends a
# per-ID running counter to each ID= value and the parent's current counter to
# each Parent= value.
cat Lalb_chloroplast.gff | sed 's/NCBI Feature Key/ncbi_feature_key/; s/NCBI Join Type/ncbi_join_type/; s/Transferred /transferred_/g; s/\t\(gene\t.*\)pseudo=;/\tpseudo\1/; /pseudogene/ s/\) { my ($id) = /ID=([^;]*)/; my $count = $ids{$id}; $count++; $ids{$id} = $count; $id .= "_$count"; s/ID=[^;]*/ID=$id/; my ($parent) = /Parent=([^;]*)/; if (defined $parent) { my $count = $ids{$parent}; $parent .= "_$count"; s/Parent=[^;]*/Parent=$parent/; } print; } ' | /erdos/adf/sw/hacks/gff_fixphase.pl | sort_gff.pl > chloro_for_gt

# ---------------------------------------------------------------------------
# Mitochondrion GFF
# ---------------------------------------------------------------------------
# Similar to the chloroplast step, but much less custom munging is needed:
#   1. give every feature a Name of the form lupal.<ID>;
#   2. on lines matching CDS, add Parent=gene<suffix> after ID=cds<suffix>.
# The sed script must stay attached to the sed command (no bare line break
# between them), otherwise the shell ends the command before the script.
sed '
    s/ID=\([^;]*\)/ID=\1;Name=lupal.\1/;
    /CDS/ s/ID=cds\([^;]*\)/ID=cds\1;Parent=gene\1/;
' Lalb_mitochondrion.gff3 | /erdos/adf/sw/hacks/gff_fixphase.pl | sort_gff.pl > mito_for_gt

# ---------------------------------------------------------------------------
# Nuclear annotation GFF
# ---------------------------------------------------------------------------
# Mostly applies our naming-convention additions, specifically things that are
# currently outside the scope of the ID prefixing done by the "normalizer":
#   - lower-case the EC_number attribute key;
#   - add a ".1" suffix to mRNA IDs and to Parent references to them;
#   - when Name is "<other>,<Lalb...>", keep the Lalb part as Name and move the
#     rest to Alias;
#   - prefix every Name with "lupal.", and suffix mRNA Names with ".1".
# Statement order matters: the Name/Alias split must run before the prefixing.
sed '
    s/EC_number=/ec_number=/;
    s/ID=mRNA:\([^;]*\)/ID=mRNA:\1.1/;
    s/Parent=mRNA:\([^;]*\)/Parent=mRNA:\1.1/;
    s/Name=\([^;]*\),\(Lalb[^;,]*\)/Name=\2;Alias=\1/;
    s/Name=\([^;]*\)/Name=lupal.\1/;
    /\tmRNA\t/ s/Name=\([^;]*\)/Name=\1.1/;
' Lalbus-20171117r1-v1.annot.gff3 > nuclear_for_gt

# ---------------------------------------------------------------------------
# Putting it all together
# ---------------------------------------------------------------------------
# Keep the nuclear file's ##gff-version header and drop the organelle ones so
# the combined file carries a single version pragma.
cat nuclear_for_gt <(grep -vh '##gff-version' chloro_for_gt mito_for_gt) > combined.gff3

# NOTE(review): --ann 0 is passed here, while the FASTA IDs and file names
# below use "ann1". Left as recorded; confirm which value the normalizer expects.
normalizer prefix --target combined.gff3 --genus Lupinus --species albus --gnm 1 --ann 0 --infra_id Amiga --unique_key 3GKS

# ---------------------------------------------------------------------------
# Sequence FASTA files: rewrite headers to match the mRNA IDs used in the GFF
# ---------------------------------------------------------------------------
# The captured ID stops at the first tab OR the end of the line. Excluding the
# newline matters: with `perl -p` the line still carries its "\n", and a class
# that only excludes tabs would swallow it on tab-less headers, pushing the
# ".1" suffix onto the following line. ${1} is used instead of \1 because \1
# on the replacement side of s/// is discouraged in Perl.
perl -p -e 's/^>([^\t\n]*)/>lupal.Amiga.gnm1.ann1.mRNA:${1}.1/' mrna.fasta > lupal.Amiga.gnm1.ann1.3GKS.mRNA.fna
perl -p -e 's/^>([^\t\n]*)/>lupal.Amiga.gnm1.ann1.mRNA:${1}.1/' prot.fasta > lupal.Amiga.gnm1.ann1.3GKS.protein.faa
perl -p -e 's/^>([^\t\n]*)/>lupal.Amiga.gnm1.ann1.mRNA:${1}.1/' cds.fasta > lupal.Amiga.gnm1.ann1.3GKS.cds.fna