#all input files were downloaded from https://www.whitelupin.fr/download.html
#nuclear genome assembly
wget https://www.whitelupin.fr/genome_files/Lalbus-20171117r1.genome.fasta
#organelle archives; later steps read Lalb_mitochondrion.fasta/.gff3 and
#Lalb_chloroplast.fasta/.gff, none of which are downloaded individually, so the
#archives have to be extracted before those steps can run
#NOTE(review): presumably both archives unpack straight into the working directory -- confirm
wget https://www.whitelupin.fr/genome_files/mitochondrion/Lalb_mitochondrion.zip
wget https://www.whitelupin.fr/genome_files/chloroplast/Lalb_chloroplast.zip
unzip -o Lalb_mitochondrion.zip
unzip -o Lalb_chloroplast.zip
#annotation tracks
wget https://www.whitelupin.fr/genome_files/Lalbus-20171117r1-v1.annot.gff3
wget https://www.whitelupin.fr/genome_files/ShortStack_Clusters_All.gff3
wget https://www.whitelupin.fr/genome_files/4-DANTE-tracks.gff
wget https://www.whitelupin.fr/genome_files/Lalbus-satDNA-annotation-final.gff
wget https://www.whitelupin.fr/genome_files/Lalbus-rDNA-annotation-final.gff
#sequence sets
wget https://www.whitelupin.fr/genome_files/gene.fasta
wget https://www.whitelupin.fr/genome_files/prot.fasta
wget https://www.whitelupin.fr/genome_files/cds.fasta
wget https://www.whitelupin.fr/genome_files/mrna.fasta
wget https://www.whitelupin.fr/genome_files/ncrna.fasta
#concatenate nuclear, chloroplast and mitochondrion sequences (in that order) into one
#fasta, then run the normalizer prefix and index steps on it
cat \
    Lalbus-20171117r1.genome.fasta \
    Lalb_chloroplast.fasta \
    Lalb_mitochondrion.fasta \
    > genome.fasta
normalizer prefix \
    --target genome.fasta \
    --genus Lupinus \
    --species albus \
    --gnm 1 \
    --infra_id Amiga \
    --unique_key F4NR
normalizer index \
    --target Lupinus_albus/Amiga.gnm1.F4NR/lupal.Amiga.gnm1.F4NR.genome_main.fna
#Chloroplast GFF cleanup. Mostly this is addressing various complaints of the chado gff
#loader, and making a best effort to get records linked to appropriate parents.
#
#sed stage:
#  - rewrites the "NCBI Feature Key" / "NCBI Join Type" attribute keys and every
#    "Transferred " prefix into space-free, lower-case forms
#  - on rows whose type column is "gene" and that carry a "pseudo=;" attribute, drops
#    that attribute and prefixes the type with "pseudo" (gene -> pseudogene)
#
#perl stage (the loop body that follows the sed expressions):
#  - counts how many times each ID= value has been seen (%ids) and appends "_<count>"
#    to the ID so repeated IDs become unique
#  - appends "_<count>" to Parent= using the count currently stored for the parent's ID,
#    i.e. the most recently seen occurrence of that ID
#  - prints every line
#
#NOTE(review): the line "/pseudogene/ s/\) {" looks truncated. The sed script runs
#straight into the body of a Perl loop; the rest of that sed substitution, the closing
#quote, the pipe, and the perl invocation with its loop header are missing. As written
#this command will not run -- restore the lost text from the original notes before reuse.
cat Lalb_chloroplast.gff | sed 's/NCBI Feature Key/ncbi_feature_key/;
s/NCBI Join Type/ncbi_join_type/;
s/Transferred /transferred_/g;
s/\t\(gene\t.*\)pseudo=;/\tpseudo\1/;
/pseudogene/ s/\) {
my ($id) = /ID=([^;]*)/;
my $count = $ids{$id};
$count++;
$ids{$id} = $count;
$id .= "_$count";
s/ID=[^;]*/ID=$id/;
my ($parent) = /Parent=([^;]*)/;
if (defined $parent) {
my $count = $ids{$parent};
$parent .= "_$count";
s/Parent=[^;]*/Parent=$parent/;
}
print;
}
' | /erdos/adf/sw/hacks/gff_fixphase.pl | sort_gff.pl > chloro_for_gt
#mitochondrion GFF: same idea as the chloroplast cleanup, but far less custom munging.
#  1. copy the first ID= value on each line into a lupal.-prefixed Name= attribute
#  2. on lines mentioning CDS, add Parent=gene<suffix> after an ID=cds<suffix>
#the result then goes through the phase fixer and the GFF sorter
sed \
    -e 's/ID=\([^;]*\)/ID=\1;Name=lupal.\1/' \
    -e '/CDS/ s/ID=cds\([^;]*\)/ID=cds\1;Parent=gene\1/' \
    Lalb_mitochondrion.gff3 \
    | /erdos/adf/sw/hacks/gff_fixphase.pl \
    | sort_gff.pl \
    > mito_for_gt
#nuclear annotation GFF: apply our naming-convention additions, specifically the things
#that are currently outside the scope of the ID prefixing done by the "normalizer".
#expressions, in order:
#  - lower-case the EC_number attribute key
#  - append ".1" to mRNA: IDs and to Parent references that point at them
#  - when Name is "<other>,<Lalb...>", keep the Lalb part as Name and move the rest to Alias
#  - prefix Name with "lupal."
#  - on mRNA rows, append ".1" to Name as well
sed \
    -e 's/EC_number=/ec_number=/' \
    -e 's/ID=mRNA:\([^;]*\)/ID=mRNA:\1.1/' \
    -e 's/Parent=mRNA:\([^;]*\)/Parent=mRNA:\1.1/' \
    -e 's/Name=\([^;]*\),\(Lalb[^;,]*\)/Name=\2;Alias=\1/' \
    -e 's/Name=\([^;]*\)/Name=lupal.\1/' \
    -e '/\tmRNA\t/ s/Name=\([^;]*\)/Name=\1.1/' \
    Lalbus-20171117r1-v1.annot.gff3 \
    > nuclear_for_gt
#putting it all together: nuclear records first, then the organelle records with their
#'##gff-version' header lines filtered out, followed by the normalizer prefix step
{
    cat nuclear_for_gt
    grep -vh '##gff-version' chloro_for_gt mito_for_gt
} > combined.gff3
normalizer prefix \
    --target combined.gff3 \
    --genus Lupinus \
    --species albus \
    --gnm 1 \
    --ann 0 \
    --infra_id Amiga \
    --unique_key 3GKS
#rewrite fasta headers so each record ID becomes lupal.Amiga.gnm1.ann1.mRNA:<id>.1
#  - the capture stops at a tab OR the end of the line; with only [^\t] the trailing
#    newline was captured too, so on a header with no tab the ".1" suffix ended up on the
#    following line
#  - ^ restricts the rewrite to header lines; ${1} is the Perl spelling of the capture
#    in a replacement
#NOTE(review): these headers say ann1 while the GFF normalizer call uses --ann 0 -- confirm
perl -p -e 's/^>([^\t\n]*)/>lupal.Amiga.gnm1.ann1.mRNA:${1}.1/' mrna.fasta > lupal.Amiga.gnm1.ann1.3GKS.mRNA.fna
perl -p -e 's/^>([^\t\n]*)/>lupal.Amiga.gnm1.ann1.mRNA:${1}.1/' prot.fasta > lupal.Amiga.gnm1.ann1.3GKS.protein.faa
perl -p -e 's/^>([^\t\n]*)/>lupal.Amiga.gnm1.ann1.mRNA:${1}.1/' cds.fasta > lupal.Amiga.gnm1.ann1.3GKS.cds.fna