#!/usr/bin/env perl

use warnings;
# vi: set ts=4 sw=4 :
#____________________________________________________________________________
#
#   MusicBrainz -- the open internet music database
#
#   Copyright (C) 1998 Robert Kaye
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#   $Id$
#____________________________________________________________________________

use FindBin;
use lib "$FindBin::Bin/../lib";

use strict;
use DBDefs;
use integer;

use MusicBrainz::Server::Constants qw( @FULL_TABLE_LIST );

use Getopt::Long;
use Encode qw( encode );

################################################################################

my @tables = @FULL_TABLE_LIST;

my @replication_tables = qw(
    dbmirror_pending
    dbmirror_pendingdata
);

my @group_core = qw(
    area
    area_type
    area_alias
    country_area
    area_alias_type
    area_gid_redirect
    iso_3166_1
    iso_3166_2
    iso_3166_3
    artist
    artist_alias
    artist_alias_type
    artist_credit
    artist_credit_name
    artist_gid_redirect
    artist_ipi
    artist_isni
    artist_type
    cdtoc
    editor_collection_type
    event
    event_alias
    event_alias_type
    event_gid_redirect
    event_type
    gender
    instrument
    instrument_alias
    instrument_alias_type
    instrument_annotation
    instrument_gid_redirect
    instrument_type
    isrc
    iswc
    l_area_area
    l_area_artist
    l_area_event
    l_area_instrument
    l_area_label
    l_area_place
    l_area_recording
    l_area_release
    l_area_release_group
    l_area_series
    l_area_url
    l_area_work
    l_artist_artist
    l_artist_event
    l_artist_instrument
    l_artist_label
    l_artist_place
    l_artist_recording
    l_artist_release
    l_artist_release_group
    l_artist_series
    l_artist_url
    l_artist_work
    l_event_event
    l_event_instrument
    l_event_label
    l_event_place
    l_event_recording
    l_event_release
    l_event_release_group
    l_event_series
    l_event_url
    l_event_work
    l_instrument_instrument
    l_instrument_label
    l_instrument_place
    l_instrument_recording
    l_instrument_release
    l_instrument_release_group
    l_instrument_series
    l_instrument_url
    l_instrument_work
    l_label_label
    l_label_place
    l_label_recording
    l_label_release
    l_label_release_group
    l_label_series
    l_label_url
    l_label_work
    l_place_place
    l_place_recording
    l_place_release
    l_place_release_group
    l_place_series
    l_place_url
    l_place_work
    l_recording_recording
    l_recording_release
    l_recording_release_group
    l_recording_series
    l_recording_url
    l_recording_work
    l_release_group_release_group
    l_release_group_series
    l_release_group_url
    l_release_group_work
    l_release_release
    l_release_release_group
    l_release_series
    l_release_url
    l_release_work
    l_series_series
    l_series_url
    l_series_work
    l_url_url
    l_url_work
    l_work_work
    label
    label_alias
    label_alias_type
    label_gid_redirect
    label_ipi
    label_isni
    label_type
    language
    link
    link_attribute
    link_attribute_credit
    link_attribute_text_value
    link_attribute_type
    link_creditable_attribute_type
    link_text_attribute_type
    link_type
    link_type_attribute_type
    medium
    medium_cdtoc
    medium_format
    orderable_link_type
    place
    place_alias
    place_alias_type
    place_gid_redirect
    place_type
    recording
    recording_gid_redirect
    release
    release_country
    release_gid_redirect
    release_group
    release_group_gid_redirect
    release_group_primary_type
    release_group_secondary_type
    release_group_secondary_type_join
    release_label
    release_packaging
    release_status
    release_unknown_country
    replication_control
    series
    series_alias
    series_alias_type
    series_gid_redirect
    series_ordering_type
    series_type
    script
    track
    track_gid_redirect
    tracklist
    url
    url_gid_redirect
    work
    work_alias
    work_alias_type
    work_attribute
    work_attribute_type
    work_attribute_type_allowed_value
    work_gid_redirect
    work_type
    );

my @group_derived = qw(
    annotation
    area_annotation
    area_tag
    artist_annotation
    artist_meta
    artist_tag
    event_annotation
    event_tag
    instrument_tag
    label_annotation
    label_meta
    label_tag
    place_annotation
    place_tag
    recording_annotation
    recording_meta
    recording_tag
    release_annotation
    release_meta
    release_group_annotation
    release_group_meta
    release_group_tag
    release_tag
    series_annotation
    series_tag
    tag
    tag_relation
    medium_index
    work_annotation
    work_meta
    work_tag
    );

my @group_stats = grep { $_ =~ /^statistics\./ } @FULL_TABLE_LIST;

my @group_editor = qw(
    editor_sanitised
    );

my @group_edit = qw(
    edit
    edit_area
    edit_artist
    edit_event
    edit_instrument
    edit_label
    edit_note
    edit_place
    edit_recording
    edit_release
    edit_release_group
    edit_series
    edit_url
    edit_work
    vote
    );

my @group_private = qw(
    application
    area_tag_raw
    artist_rating_raw
    artist_tag_raw
    autoeditor_election
    autoeditor_election_vote
    editor
    editor_collection
    editor_collection_event
    editor_collection_release
    editor_language
    editor_oauth_token
    editor_preference
    editor_subscribe_artist
    editor_subscribe_collection
    editor_subscribe_editor
    editor_subscribe_label
    editor_subscribe_series
    editor_watch_artist
    editor_watch_preferences
    editor_watch_release_group_type
    editor_watch_release_status
    event_tag_raw
    instrument_tag_raw
    label_rating_raw
    label_tag_raw
    place_tag_raw
    recording_rating_raw
    recording_tag_raw
    release_coverart
    release_group_rating_raw
    release_group_tag_raw
    release_tag_raw
    series_tag_raw
    work_rating_raw
    work_tag_raw
);

my @group_cdstubs = qw(
    cdtoc_raw
    release_raw
    track_raw
);

my @group_cover_art_archive = grep { $_ =~ /^cover_art_archive\./ } @FULL_TABLE_LIST;

my @group_wikidocs = grep { $_ =~ /^wikidocs\./ } @FULL_TABLE_LIST;

my @group_documentation = grep { $_ =~ /^documentation\./ } @FULL_TABLE_LIST;

my @groups = (
    \@group_core,
    \@group_derived,
    \@group_stats,
    \@group_editor,
    \@group_edit,
    \@group_private,
    \@group_cdstubs,
    \@group_cover_art_archive,
    \@group_wikidocs,
    \@group_documentation
);

################################################################################

my $fHelp;
my $OutputDir = ".";
my $dir = "/tmp";
my $fCompress = 1;
my $fKeepFiles = 0;
my $fProgress = -t STDOUT;
my $fDoFullExport = 1;
my $fDoReplication = 0;
my $fCheckCompleteness = 0;
my $fCallback;
my @tablelist;

GetOptions(
    "output-dir|d=s"    => \$OutputDir,
    "tmp-dir|t=s"               => \$dir,
    "compress|c!"               => \$fCompress,
    "keep-files|k!"             => \$fKeepFiles,
    "table=s"                   => \@tablelist,
    "replication-callback=s"    => \$fCallback,
    "with-replication"          => sub { $fDoReplication = 1 },
    "without-replication"       => sub { $fDoReplication = 0 },
    "with-full-export"          => sub { $fDoFullExport = 1 },
    "without-full-export"       => sub { $fDoFullExport = 0 },
    "check-completeness"        => sub { $fCheckCompleteness = 1 },
    "help"                              => \$fHelp,
);

my $encrypt_for = DBDefs->GPG_ENCRYPT_KEY || undef;
my $sign_with = DBDefs->GPG_SIGN_KEY || undef;

sub usage
{
    print <<EOF;
Usage: ExportAllTables [options]

        --help            show this help
    -d, --output-dir DIR  place the final archive files in DIR (default: ".")
    -t, --tmp-dir DIR     use DIR for temporary storage (default: /tmp)
    -c, --[no]compress    [don't] create .tar.bz2 archives after exporting
    -k, --keep-files      don't delete the exported files from the tmp directory
        --table TABLE     process only these tables
        --with[out]-replication  Do [not] produce a replication packet
        --with[out]-full-export  Do [not] export full tables
        --check-completeness     Check that all tables from DB schema are exported

Certain combinations of options are pointless:
 * specifying --without-replication and --without-full-export
 * specifying tables (via --table) and --without-full-export
 * specifying --nocompress and omitting --keep-files

If you specify --table TABLE, you won't get a complete consistent snapshot
of the database, of course.

GPG encryption of private data requires you pass a recipient to encrypt data
for. This value is taken from DBDefs and can take any value that gpg(1) can
take for `--recipient`. The user running ExportAllTables must have a GPG
keyring setup that is aware of this recipient. GPG signing works similarly.

Encryption uses DBDefs->GPG_ENCRYPT_KEY and signing uses DBDefs->GPG_SIGN_KEY.

EOF
}

usage(), exit if $fHelp;
usage(), exit 1 if @ARGV;
usage(), exit 1 if not $fCompress and not $fKeepFiles;
usage(), exit 1 if not $fDoFullExport and not $fDoReplication;
usage(), exit 1 if not $fDoFullExport and @tablelist;
check_tables_completeness(), exit if $fCheckCompleteness;

my $erase_tmpdir_on_exit = 0;
END
{
    if ($erase_tmpdir_on_exit and not $fKeepFiles and defined($dir) and -d $dir and -d "$dir/mbdump")
    {
        print localtime() . " : Disk space just before erasing tmp dir:\n";
        system "/bin/df -m";
        print localtime() . " : Erasing $dir\n";
        system "/bin/rm", "-rf", $dir;
    }
}

use File::Temp qw( tempdir );
$SIG{'INT'} = sub { exit 3 };
$dir = tempdir("mbexport-XXXXXX", DIR => $dir, CLEANUP => 0);
$erase_tmpdir_on_exit = 1;
mkdir "$dir/mbdump" or die $!;
print localtime() . " : Exporting to $dir\n";

use MusicBrainz::Server::Context;
use Sql;

# Log in to the main DB
my $c = MusicBrainz::Server::Context->new;
my $sql = Sql->new($c->conn);
my $dbh = $c->dbh;

@tables = @tablelist if @tablelist;
@tables = () if not $fDoFullExport;

use Time::HiRes qw( gettimeofday tv_interval );
my $t0 = [gettimeofday];
my $totalrows = 0;
my $tables = 0;

# A quick discussion of the "Can't serialize access due to concurrent update"
# problem.  See "transaction-iso.html" in the Postgres documentation.
# Basically the problem is this: export "A" starts; export "B" starts; export
# "B" updates replication_control; export "A" then can't update
# replication_control, failing with the above error.
# The solution is to get a lock (outside of the database) before we start the
# serializable transaction.
open(my $lockfh, ">>/tmp/.mb-export-lock") or die $!;
use Fcntl qw( LOCK_EX );
flock($lockfh, LOCK_EX) or die $!;

$sql->auto_commit;
$sql->do("SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL SERIALIZABLE");
$sql->begin;

my $now = $sql->select_single_value("SELECT NOW()");

# Write the TIMESTAMP file
# This used to be free text; now it's parseable.  It contains a PostgreSQL
# TIMESTAMP WITH TIME ZONE expression.
writefile("TIMESTAMP", "$now\n");

# Get the replication control data

my ($iSchemaSequence, $iReplicationSequence, $dtReplicationDate) = do {
    my $row = $sql->select_single_row_hash("SELECT * FROM replication_control");
    $row ||= {};
    @$row{qw(
        current_schema_sequence
        current_replication_sequence
        last_replication_date
    )};
};

$iSchemaSequence or die "Don't know what schema sequence number we're using";
$iSchemaSequence == DBDefs->DB_SCHEMA_SEQUENCE
    or die "Stored schema sequence ($iSchemaSequence) does not match DBDefs->DB_SCHEMA_SEQUENCE (".DBDefs->DB_SCHEMA_SEQUENCE.")";

# Write the SCHEMA_SEQUENCE file.  Again, this is parseable - it's just an
# integer.
writefile("SCHEMA_SEQUENCE", "$iSchemaSequence\n");

# Sanitise various things for public consumption

if (grep { $_ eq "editor_sanitised" } @tables)
{
    $sql->do("SELECT id,
                name,
                0 AS privs,
                '' AS email,
                NULL AS website,
                NULL AS bio,
                member_since,
                email_confirm_date,
                now() AS last_login_date,
                edits_accepted,
                edits_rejected,
                auto_edits_accepted,
                edits_failed,
                last_updated,
                NULL AS birth_date,
                NULL AS gender,
                NULL as area,
                '{CLEARTEXT}mb' AS password,
                md5(name || ':musicbrainz.org:mb') AS ha1,
                deleted
              INTO TEMPORARY editor_sanitised
              FROM editor");
}

$| = 1;

printf "%-30.30s %9s %4s %9s\n",
    "Table", "Rows", "est%", "rows/sec",
    ;

my %rowcounts;

for my $table (@tables)
{
    dumptable($table);
}

sub table_rowcount
{
    my $table = shift;
    $table =~ s/_sanitised$//;

    $table =~ s/.*\.//;

    $sql->select_single_value(
        "SELECT reltuples FROM pg_class WHERE relname = ? LIMIT 1",
        $table,
    );
}

sub dumptable
{
    my $table = shift;

    open(DUMP, ">$dir/mbdump/$table")
        or die $!;

    my $estrows = table_rowcount($table) || 1;

    $sql->do("COPY $table TO stdout");
    my $buffer;
    my $rows = 0;

    my $t1 = [gettimeofday];
    my $interval;

    my $p = sub {
        my ($pre, $post) = @_;
        no integer;
        printf $pre."%-30.30s %9d %3d%% %9d".$post,
                $table, $rows, int(100 * $rows / $estrows),
                $rows / ($interval||1);
    };

    $p->("", "") if $fProgress;

    my $longest = 0;

    while ($dbh->pg_getcopydata($buffer) >= 0)
    {
        $longest = length($buffer) if length($buffer) > $longest;
        print DUMP encode('utf-8', $buffer)
                or die $!;

        ++$rows;
        unless ($rows & 0xFFF)
        {
                $interval = tv_interval($t1);
                $p->("\r", "") if $fProgress;
        }
    }

    close DUMP
        or die $!;

    $interval = tv_interval($t1);
    $p->(($fProgress ? "\r" : ""), sprintf(" %.2f sec\n", $interval));
    print "Longest buffer used: $longest\n" if $ENV{SHOW_BUFFER_SIZE};

    ++$tables;
    $totalrows += $rows;
    $rowcounts{$table} = $rows;

    $rows;
}

if ($fDoReplication)
{
    # Is there any replication data to dump?
    my $fAnyReplicationData = 0;
    for my $table (@replication_tables)
    {
        $fAnyReplicationData = 1
            if $sql->select_single_value("SELECT COUNT(*) FROM $table") > 0;
    }

    if ($fAnyReplicationData or not $iReplicationSequence)
    {
        # Are we starting replication for the first time?
        if (not defined $iReplicationSequence)
        {
                # If we're currently not at a replication point, then let's turn this
                # one into sequence #0.
                $iReplicationSequence = 0;

                # In this case we may as well ditch any pending
                # replication data; no-one can ever load and apply packet #0, so it
                # will effectively just be there for cosmetic reasons.
                empty_replication_tables();
        } else {
                # Otherwise, we're moving on to the next replication sequence
                # number.
                ++$iReplicationSequence;
        }

        # Set the replication sequence number.  This should (I think) be
        # the final transaction included in each packet.
        $iReplicationSequence ||= 0;

        $sql->do(
                "UPDATE replication_control
                SET current_replication_sequence = ?,
                last_replication_date = NOW()",
                $iReplicationSequence,
        );

        print localtime() . " : Producing replication packet #$iReplicationSequence\n";

        # Dump 'em
        dumptable($_) for @replication_tables;

        # Remove the rows we just dumped
        empty_replication_tables();

        writefile("REPLICATION_SEQUENCE", "$iReplicationSequence\n");
    } elsif (defined $iReplicationSequence and not $fAnyReplicationData) {
        print localtime() . " : No changes since the last replication point (#$iReplicationSequence)\n";
        writefile("REPLICATION_SEQUENCE", "$iReplicationSequence\n");
    } else {
        print localtime() . " : No replication data dumped - not producing a replication packet\n";
        writefile("REPLICATION_SEQUENCE", "");
    }
} else {
    writefile("REPLICATION_SEQUENCE", "");
}

# Dump this /after/ we've possibly updated the current_replication_sequence
# Dump this table only if we're dumping everything (i.e., @tablelist is empty) or if it's explicitly requested.
dumptable("replication_control") if (scalar @tablelist == 0 || grep { $_ eq 'replication_control' } @tablelist);

# Make sure our replication data is safe before we commit its removal from the database
system "/bin/sync"; $? == 0 or die "sync failed (rc=$?)";
$sql->commit;

my $dumptime = tv_interval($t0);
printf "%s : Dumped %d tables (%d rows) in %d seconds\n",
    scalar localtime,
    $tables, $totalrows, $dumptime;

optimise_replication_tables() if $fDoReplication;

# Now we have all the files; disconnect from the database.
# This also drops the _sanitised temporary tables.
undef $sql;
undef $c;

# We can release the lock, allowing other exports to run if they wish.
close $lockfh;

use File::Copy qw( copy );

if ($fCompress and $fDoReplication)
{
    my $tarfile = "replication-$iReplicationSequence.tar.bz2";

    if (grep { $rowcounts{$_} } @replication_tables)
    {
        copy_readme() or die $!;
        # Go for the "most restrictive" license
        copy("$FindBin::Bin/COPYING-CCShareAlike", "$dir/COPYING") or die $!;
        make_tar($tarfile, @replication_tables);
    }
    if ($fCallback) {
        system $fCallback;
    }
}

if ($fCompress and $fDoFullExport)
{
    copy_readme() or die $!;
    copy("$FindBin::Bin/COPYING-PublicDomain", "$dir/COPYING") or die $!;
    make_tar("mbdump.tar.bz2", @group_core);

    copy_readme() or die $!;
    copy("$FindBin::Bin/COPYING-CCShareAlike", "$dir/COPYING") or die $!;
    make_tar("mbdump-derived.tar.bz2", @group_derived);
    make_tar("mbdump-editor.tar.bz2",  @group_editor);
    make_tar("mbdump-edit.tar.bz2",  @group_edit);
    make_tar("mbdump-stats.tar.bz2",  @group_stats);
    make_tar("mbdump-private.tar.bz2", @group_private);
    make_tar("mbdump-cdstubs.tar.bz2", @group_cdstubs);
    make_tar("mbdump-cover-art-archive.tar.bz2", @group_cover_art_archive);
    make_tar("mbdump-wikidocs.tar.bz2", @group_wikidocs);
    make_tar("mbdump-documentation.tar.bz2", @group_documentation);

    system "cd $OutputDir && /usr/bin/md5sum --binary *.tar.bz2" .
           ' | grep -v mbdump-private > MD5SUMS';

    $? == 0 or die "md5sum returned $?";

    gpg_sign("$OutputDir/MD5SUMS");

    if ($encrypt_for) {
        system "gpg --recipient '$encrypt_for' --encrypt $OutputDir/mbdump-private.tar.bz2";
        system "rm $OutputDir/mbdump-private.tar.bz2";
    }
}

# Tar files all created safely... we can erase the tmpdir on exit
system "/bin/sync"; $? == 0 or die "sync failed (rc=$?)";
$erase_tmpdir_on_exit = 1;

exit;

################################################################################

sub make_tar
{
    my ($tarfile, @files) = @_;

    @files = map { "mbdump/$_" } grep { defined $rowcounts{$_} } @files;

    # These ones go first, so MBImport can quickly find them
    unshift @files, qw(
        TIMESTAMP
        COPYING
        README
        REPLICATION_SEQUENCE
        SCHEMA_SEQUENCE
    );

    my $t0 = [gettimeofday];
    print localtime() . " : Creating $tarfile\n";
    system { "/bin/tar" } "tar",
        "-C", $dir,
        "--bzip2",
        "--create",
        "--verbose",
        "--file", "$OutputDir/$tarfile",
        "--",
        @files,
        ;
    $? == 0 or die "Tar returned $?";
    printf "%s : Tar completed in %d seconds\n", scalar localtime, tv_interval($t0);

    gpg_sign( "$OutputDir/$tarfile" ) if $sign_with;
}

sub writefile
{
    my ($file, $contents) = @_;
    open(my $fh, ">$dir/$file") or die $!;
    print $fh $contents or die $!;
    close $fh or die $!;
}

sub empty_replication_tables
{
    $sql->do("DELETE FROM dbmirror_pendingdata");
    $sql->do("DELETE FROM dbmirror_pending");
    # As soon as we commit, the replication data is gone from the DB, so we must
    # be sure that we have a safe copy.
    $erase_tmpdir_on_exit = 0;
}

sub optimise_replication_tables
{
    print localtime() . " : Optimising replication tables\n";
    $sql->auto_commit;
    $sql->do("VACUUM ANALYZE dbmirror_pendingdata");
    $sql->auto_commit;
    $sql->do("VACUUM ANALYZE dbmirror_pending");
}

sub copy_readme
{
    my $text = <<END;
The files in this directory are snapshots of the MusicBrainz database,
in a format suitable for import into a PostgreSQL database. To import
them, you need a compatible version of the MusicBrainz server software.
END
    writefile("README", $text);
}

sub parse_create_tables
{
    my ($dir, $schema, $tables) = @_;

    open FILE, "<$FindBin::Bin/../admin/sql/$dir/CreateTables.sql";
    my $create_tables_sql = do { local $/; <FILE> };
    close FILE;
    while ($create_tables_sql =~ m/CREATE TABLE\s+([a-z0-9_]+)\s+\(\s*(.*?)\s*\);/gsi) {
        my $table = $schema ? "$schema.$1" : $1;
        push @$tables, $table;
    }
}

sub check_tables_completeness
{
    my @create_tables;

    parse_create_tables('', '', \@create_tables);
    parse_create_tables('caa', 'cover_art_archive', \@create_tables);
    parse_create_tables('documentation', 'documentation', \@create_tables);
    parse_create_tables('wikidocs', 'wikidocs', \@create_tables);
    parse_create_tables('statistics', 'statistics', \@create_tables);

    my %exported_tables = map { $_ => 1 } @tables;
    my @not_exported_tables = grep { not exists $exported_tables{$_} } @create_tables;
    printf "Table not exported: %s\n", $_
        for sort @not_exported_tables;

    my @no_group_tables;
    foreach my $table (@tables) {
        my $included = 0;
        foreach my $group (@groups) {
            if (grep {$_ eq $table} @$group) {
                $included = 1;
            }
        }
        push @no_group_tables, $table if not $included;
    }
    printf "Table exported but not included in any tar: %s\n", $_
        for sort @no_group_tables;
}

sub gpg_sign
{
    my ($file_to_be_signed) = @_;

    # Skip GPG signing of a file if no "sign with" key is given
    return unless $sign_with;

    # Sign requested file
    system "gpg",
        "--default-key", "$sign_with",
        "--detach-sign",
        "--armor",
        $file_to_be_signed;

    # Inform user about a failed signature process
    if ( $? != 0 ) {
        printf STDERR "Failed to sign $file_to_be_signed \n"
            . "GPG returned $?";
    }
}

# eof ExportAllTables
