612 lines
17 KiB
Perl
Executable File
612 lines
17 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
# -*- Mode: Perl -*-
|
|
# dirsplit ---
|
|
# Author : Eduard Bloch ( blade@debian.org )
|
|
# Last Modified On : Sun, 06 Feb 2005 14:59:51 +0100
|
|
# Status : Working, but use with caution!
|
|
# License: GPLv2
|
|
|
|
my $version="0.3.3";
|
|
|
|
require v5.8.1;
|
|
use strict;
|
|
use List::Util 'shuffle';
|
|
use Getopt::Long qw(:config no_ignore_case bundling);
|
|
use File::Basename;
|
|
use File::Path;
|
|
use Cwd 'abs_path';
|
|
|
|
# Exit status handed to exit() at the end of the main program.
my $ret = 0;

# Tunables; each one can be overridden from the command line (see %options).
my $max    = "4488M";    # medium size, may carry a unit suffix (see fixnr)
my $prefix = "vol_";     # first part of the catalog/directory names
my $acc    = 20;         # number of packing attempts in the calculation loop
my $emode  = 1;          # directory exploration mode (see explore)
my $bsize  = 2048;       # block size used by getsize for rounding
my $ofac   = 50;         # per-character filename overhead used by getsize

# Command line switches.
my $opt_help;
my $opt_longhelp;
my $opt_sim;
my $opt_dir;
my $opt_flat;
my $opt_move;
my $opt_ver;
my $opt_sln;
my $opt_ln;
my $opt_filter;
my $opt_simple;
my $opt_follow;
my $get_ver;
my $opt_listfile;

# Getopt::Long specification => variable receiving the value.
my %options = (
    "h|help"       => \$opt_help,
    "d|dirhier"    => \$opt_dir,
    "flat"         => \$opt_flat,
    "f|filter=s"   => \$opt_filter,
    "F|follow"     => \$opt_follow,
    "e|expmode=i"  => \$emode,
    "o|overhead=i" => \$ofac,
    "b|blksize=i"  => \$bsize,
    "n|no-act"     => \$opt_sim,
    "m|move"       => \$opt_move,
    "l|symlink"    => \$opt_sln,
    "L|hardlink"   => \$opt_ln,
    "v|verbose"    => \$opt_ver,
    "s|size=s"     => \$max,
    "S|simple"     => \$opt_simple,
    "T|input=s"    => \$opt_listfile,
    "p|prefix=s"   => \$prefix,
    "a|accuracy=i" => \$acc,
    "H|longhelp"   => \$opt_longhelp,
    "version"      => \$get_ver
);

# Unparsable options are an error (status 1); an explicit request for help
# is not, so it exits with status 0.
show_help(1) unless ( GetOptions(%options) );
show_help(0) if $opt_help;
show_longhelp() if $opt_longhelp;
if ($get_ver) {
    # terminate the line so the shell prompt does not end up glued to it
    print "$version\n";
    exit 0;
}

# ignore the old dirhier setting since it is the default now; the flag is
# only disabled when --flat is specified
$opt_dir = !$opt_flat;

# a dry run is only useful when it reports what it would do
$opt_ver = 1 if $opt_sim;
# symlink/hardlink modes share the target-directory code path of move mode
$opt_move = 1 if ( $opt_sln || $opt_ln );
|
|
|
|
# @sizes: one entry per "item" to be packed (the item's size).
# %names: maps a size to the list of objects registered with that size; an
#         object is a filename or an array reference for coalesced files.
my @sizes;
my %names;

# @result holds the calculated layout: one array (bin) per volume. In simple
# mode the bins contain the objects themselves, otherwise they contain sizes
# which are resolved through %names later.
my @result;

my $inputdir;

# Convert the size specification to bytes and reserve about 400kB for the
# iso headers.
$max = fixnr($max);
$max -= 420000;

# init default value
my $globwaste = 0;

my $have_dir = ( -d $ARGV[0] ) || ( -d readlink( $ARGV[0] ) );

if ($have_dir) {
    syswrite( STDOUT, "Building file list, please wait...\n" );

    # save the absolute path before doing anything
    $inputdir = Cwd::abs_path( $ARGV[0] );
    explore($inputdir);
}
elsif ($opt_listfile) {
    # read "size path" lines from the given file, or from STDIN for "-"
    my $in;
    if ( $opt_listfile eq "-" ) {
        $in = \*STDIN;
    }
    else {
        open( $in, "<", $opt_listfile ) || die "Cannot open list file $opt_listfile\n";
    }
    parseListe($in);
}
else {
    die "Error: please specify a directory\n";
}
|
|
|
|
# Reject requests that cannot be satisfied and sum up the total size.
my $testsize = 0;
foreach my $objsize (@sizes) {
    if ( $objsize > $max ) {
        die "Too large object(s) ($objsize) for the given max size: @{$names{$objsize}} (maybe coalesced in arrays, check manually)\n";
    }
    $testsize += $objsize;
}

# Everything fits on one medium: a single pass just generates the list,
# more trials are pointless.
if ( $testsize <= $max ) {
    $acc = 1;
}
if ($opt_ver) {
    print "\nSumm: $testsize\n";
}
if ( $testsize < 4096 ) {
    # looks like just an empty dir
    die "Nothing to do!\n";
}

unless ($opt_simple) {
    syswrite( STDOUT, "Calculating, please wait...\n" );
    my $starttime = time;

    # Start from an upper bound so that the first attempt always wins.
    $globwaste = $max * scalar(@sizes);

    foreach my $round ( 1 .. $acc ) {
        syswrite( STDOUT, "." );

        my @candidate;
        my $waste = bp_firstfit( $max, \@sizes, \@candidate );

        # keep the layout wasting the least space so far
        if ( $waste < $globwaste ) {
            $globwaste = $waste;
            @result    = @candidate;
        }

        # give the hint once only; $starttime is dropped afterwards
        if ( $starttime && time() > $starttime + 10 ) {
            syswrite( STDOUT, "\nSpent already over 10s (for $round iterations)\nHint: reduce accuracy to make it faster!\n" );
            undef $starttime;
        }

        # another input order for the next attempt
        @sizes = shuffle(@sizes);
    }
}

print "\nCalculated, using " . ( scalar @result ) . " volumes.\n";
print "Wasted: $globwaste Byte (estimated, check mkisofs -print-size ...)\n";
|
|
|
|
# And the real work: walk over the calculated volumes and either write one
# mkisofs catalog per volume or move/symlink/hardlink the files into one
# target directory per volume.
#
# $i is the 1-based volume number used in the catalog/directory names.
my $i = 0;

# Length of the explored directory's path; only meaningful when a directory
# was explored. In list mode (-T) $inputdir is not set.
my $inDirLen = defined($inputdir) ? length($inputdir) : 0;

for my $volume (@result) {
    $i++;

    # Catalog handle, only opened in catalog mode and when really acting.
    my $o;
    if ( !( $opt_move || $opt_sim ) ) {
        my $listname = "$prefix$i.list";
        open( $o, ">", $listname )
          or die "Cannot create catalog $listname: $!\n";
    }

    my $dirPrefix    = dirname($prefix);
    my $prefixBase   = basename($prefix);
    my $dirPrefixAbs = Cwd::abs_path($dirPrefix);

    for my $entry ( @{$volume} ) {

        # For simple mode, the files/atoms are already resolved, otherwise
        # take the next object registered with the appropriate size.
        my $item = $opt_simple ? $entry : shift( @{ $names{$entry} } );

        # Always work on a list of files; wrap a single file if needed.
        my $stuffRef = ( ref($item) eq "ARRAY" ) ? $item : [$item];

        for my $file ( @{$stuffRef} ) {

            # Path relative to the volume root.
            my $relFile;
            if ( defined $inputdir ) {
                # explored files are always "$inputdir/..."
                $relFile = substr( $file, $inDirLen + 1 );
            }
            else {
                # List mode: there is no common input directory, so only
                # drop leading "/" and "./" instead of blindly cutting off
                # the first character of the path.
                ( $relFile = $file ) =~ s{^(?:\.?/)+}{};
            }
            my $base = basename($relFile);

            if ($opt_move) {
                my $targetsubdir = $dirPrefixAbs . "/$prefixBase$i";
                $targetsubdir .= "/" . dirname($relFile) if ($opt_dir);
                my $target = "$targetsubdir/$base";

                print "$file -> $target\n" if ($opt_ver);
                next if ($opt_sim);

                # mkpath reports failure by dying; its return value is the
                # list of directories it created, which is empty when the
                # directory already exists, so it must not be used as a
                # success flag.
                eval { mkpath($targetsubdir); 1 }
                  or die "Problems creating $targetsubdir: $@";

                # last check
                die "Could not create $targetsubdir?\n"
                  if ( !( -d $targetsubdir && -w $targetsubdir ) );

                my $ok;
                if ($opt_sln) {
                    $ok = symlink( $file, $target );
                }
                elsif ($opt_ln) {
                    if ( -d $file && !( -l $file ) ) {
                        # directories cannot be hardlinked, recreate them;
                        # an already existing directory is fine
                        $ok = mkdir($target) || -d $target;
                    }
                    else {
                        $ok = link( $file, $target );
                    }
                }
                else {
                    $ok = rename( $file, $target );
                }

                # Keep going with the remaining files, but tell the user
                # and report the failure through the exit status.
                if ( !$ok ) {
                    warn "Could not transfer $file to $target: $!\n";
                    $ret = 1;
                }
            }
            else {
                # escape = in mkisofs catalogs, they are used as separator
                my $isoname = ( $opt_dir ? $relFile : $base );
                $isoname =~ s/=/\\=/g;
                my $sourcefile = $file;
                $sourcefile =~ s/=/\\=/g;

                print "$i: /$isoname=$sourcefile\n" if $opt_ver;
                print $o "/$isoname=$sourcefile\n" if ( !$opt_sim );
            }
        }
    }

    # Buffered write errors only show up when the handle is closed.
    if ($o) {
        close($o) or die "Error while writing $prefix$i.list: $!\n";
    }
}

exit $ret;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# explore(DIR)
#
# Recursively scans DIR and registers what it finds through insitem().
# Subdirectories are always descended into first; how the non-directories
# of DIR itself are registered depends on the exploration mode $emode:
#   1:     every file is registered as its own item
#   other: all files of the directory are registered together as one
#          coalesced object (an array reference). If that object would
#          exceed $max, mode 3 falls back to single files and mode 4 splits
#          the (sorted) file set into several coalesced objects, none of
#          them larger than $max.
# An empty directory is registered as an item itself.
# Dies when DIR cannot be opened or, in the coalescing modes, when a single
# file is larger than $max.
sub explore {
    my ($dir) = @_;
    my @dirs;
    my @files;

    # A lexical handle: this function is recursive and must not share one
    # global directory handle between invocations.
    opendir( my $dh, $dir ) || die "Could not open $dir\n";
    my @stuff = readdir($dh);
    closedir($dh);

    if ($opt_simple) {
        @stuff = sort { lc($a) cmp lc($b) } @stuff;
    }

    # The loop variable has to be called $f: the filter expression below is
    # evaluated as Perl code applied to a variable of that name.
    foreach my $f (@stuff) {
        next if ( $f eq "." || $f eq ".." );

        $f = "$dir/$f" if ( $dir ne "." );

        if ($opt_filter) {
            # NOTE(review): string eval of the --filter argument, i.e. the
            # option value is executed as Perl code. That is the documented
            # interface, but it must never be fed from untrusted input.
            # The filter is applied to directories as well as files.
            next unless ( eval("\$f=~$opt_filter;") );
        }

        if ( -l $f && !$opt_follow ) {
            # symlinks are treated as plain files unless --follow is set
            push( @files, $f );
        }
        elsif ( -d $f ) {
            push( @dirs, $f );
        }
        else {
            push( @files, $f );
        }
    }

    if ( ( @dirs + @files ) == 0 ) {
        # this one is empty, register for cosmetics reason
        insitem( getsize($dir), $dir );
        return;
    }

    # recurse on directories
    explore($_) for (@dirs);

    # and now process files
    if ( $emode == 1 ) {
        insitem( getsize($_), $_ ) for (@files);
        return;
    }

    # Coalescing modes - first some sanity checks.
    my $filesum = 0;
    for my $file (@files) {
        my $size = getsize($file);
        if ( $size > $max ) {
            # already too large, stop right here
            die "Too large file ($file) for the given max size $max, aborting...\n";
        }
        $filesum += $size;
    }

    # handle coalesced objects becoming too large
    if ( $filesum > $max ) {
        if ( $emode == 3 ) {
            # don't coalesce in this mode, register the files one by one
            insitem( getsize($_), $_ ) for (@files);
            return;
        }
        if ( $emode == 4 ) {
            # Split the file set into chunks of at most $max. Every chunk is
            # registered as a fresh anonymous array; handing out a reference
            # to the working array would let later changes to it alter the
            # objects registered earlier.
            my $partsum = 0;
            my @tmpvol;
            for my $file ( sort(@files) ) {
                my $size = getsize($file);
                if ( $partsum + $size > $max ) {
                    # the file no longer fits, close the current chunk
                    insitem( $partsum, [@tmpvol] );
                    @tmpvol  = ();
                    $partsum = 0;
                }
                push( @tmpvol, $file );
                $partsum += $size;
            }

            # do not forget the files collected after the last split
            insitem( $partsum, [@tmpvol] ) if (@tmpvol);
            return;
        }
    }

    # ok, building a coalesced object for simple cases
    if ($filesum) {
        insitem( $filesum, \@files );
    }
}
|
|
|
|
# State of the simple mode packing: index of the bin currently being filled
# and the fill level of every bin.
# NOTE(review): the main program calls exit before execution reaches this
# line, so the "=0" is presumably never executed; the code below then relies
# on an undefined index/counter being treated as 0.
my $simplePos = 0;
my @simpleBinSizes;

# insitem(SIZE, OBJECT)
#
# Registers OBJECT (a filename or a reference to a list of filenames) with
# the given SIZE in @sizes/%names. In simple mode the object is additionally
# appended to the current bin in @result right away.
sub insitem {
    my ( $size, $object ) = @_;

    # the pool used by the calculation in the main program
    push( @sizes, $size );
    push( @{ $names{$size} }, $object );

    if ($opt_simple) {
        # The simplest way to fill the bins: start a new one as soon as the
        # object to be added no longer fits, and account the space left in
        # the old bin as waste.
        my $wouldbe = $simpleBinSizes[$simplePos] + $size;
        if ( $wouldbe > $max ) {
            $globwaste += ( $max - $simpleBinSizes[$simplePos] );
            $simplePos++;
        }
        $simpleBinSizes[$simplePos] += $size;
        push( @{ $result[$simplePos] }, $object );
    }

}
|
|
|
|
# getsize(FILE)
#
# Returns the estimated space FILE occupies on the target medium: the size
# reported by stat, rounded up to a multiple of the block size $bsize, plus
# a fixed amount of 200 and $ofac for every character of the file's
# basename, plus 1.
sub getsize {
    my ($file) = @_;

    my $bytes = ( stat($file) )[7];

    # round up to the next block boundary unless already aligned
    my $tail = $bytes % $bsize;
    if ($tail) {
        $bytes = $bytes + $bsize - $tail;
    }

    my $name_overhead = $ofac * length( basename($file) );
    return 1 + int( 200 + $name_overhead + $bytes );
}
|
|
|
|
# parseListe(HANDLEREF)
#
# Reads "SIZE PATH" lines from the file handle passed by reference and
# registers each of them through insitem(). SIZE is run through fixnr(), so
# unit suffixes are accepted. Lines of any other shape are skipped.
sub parseListe {
    # the callers pass a reference (\*STDIN or a lexical handle)
    my $fh = ${ $_[0] };

    while (<$fh>) {
        next unless (/^(\w+)\s+(.+)/);
        insitem( fixnr($1), $2 );
    }
}
|
|
|
|
# fixnr(NUMBER [, DEFAULT_MULTIPLIER])
#
# Converts a size specification to a number of bytes.
#   NUMBER              digits, optionally with a decimal fraction, followed
#                       by a multiplier letter, e.g. "700M", "4.7g"
#   DEFAULT_MULTIPLIER  optional multiplier letter used when NUMBER does not
#                       carry one
# Multipliers: b (bytes), k/m/g (powers of 1000), K/M/G (powers of 1024).
# Returns NUMBER unchanged when there is neither a multiplier in it nor a
# default; otherwise the byte count rounded to the nearest integer.
# Dies when the multiplier is not one of the letters above.
sub fixnr {
    my ( $value, $default_fac ) = @_;

    my %multiplier = (
        g => 1000000000,
        G => 1073741824,
        m => 1000000,
        M => 1048576,
        k => 1000,
        K => 1024,
        b => 1,
    );

    my $nr;
    my $fac;
    # The fraction is optional and only taken when digits follow the dot, so
    # every input accepted before is still parsed exactly as before.
    if ( $value =~ /(\d+(?:\.\d+)?)(\D)/ ) {
        $nr  = $1;
        $fac = $2;
    }
    elsif ( defined($default_fac) ) {
        $nr  = $value;
        $fac = $default_fac;
    }
    else {
        return $value;
    }

    die "$fac is not a valid multiplier!" unless ( exists $multiplier{$fac} );

    # Round instead of truncating: a fractional input must not end up one
    # byte short because of binary floating point representation.
    return int( $nr * $multiplier{$fac} + 0.5 );
}
|
|
|
|
|
|
# show_help(STATUS)
#
# Prints the short option summary to STDOUT and terminates the program with
# STATUS as exit code.
sub show_help {
    my ($status) = @_;

    my $summary = <<EOM;
dirsplit [options] [advanced options] < directory >

-H|--longhelp Show the long help message with more advanced options
-n|--no-act Only print the commands, no action (implies -v)
-s|--size NUMBER - Size of the medium (default: $max)
-e|--expmode NUMBER - directory exploration mode (recommended, see long help)
-m|--move Move files to target dirs (default: create mkisofs catalogs)
-p|--prefix STRING - first part of catalog/directory name (default: vol_)
-h|--help Show this option summary
-v|--verbose More verbosity

The complete help can be displayed with the --longhelp (-H) option.
The default mode is creating file catalogs useable with:
mkisofs -D -r --joliet-long -graft-points -path-list CATALOG

Example:
dirsplit -m -s 700M -e2 random_data_to_backup/
EOM

    print $summary;
    exit $status;
}
|
|
|
|
# show_longhelp()
#
# Prints the complete help text, including the advanced options, to STDOUT
# and terminates the program with exit code 0. The text interpolates the
# current values of $max, $bsize and $acc.
sub show_longhelp {
    # Backslashes that have to show up in the output are doubled, this
    # includes the \n of the find -printf example.
    my $msglong="
dirsplit [options] [advanced options] < directory >
-n|--no-act Only print the commands, no action (implies -v)
-s|--size NUMBER - Size of the medium (default: $max)
-m|--move Move files to target dirs (default: create mkisofs catalogs)
-l|--symlink similar to -m but just creates symlinks in the target dirs
-L|--hardlink like -l but creates hardlinks
-p|--prefix STRING - first part of catalog/directory name (default: vol_)
-f|--filter EXPR - Filter expression, see examples below and perlre manpage
--flat Flat dir mode, don't recreate subdirectory structure (not recommended)
-e|--expmode NUMBER, special exploration modes, used with directory argument

1: (default) native exploration of the specified directory, but file sizes
are rounded up to 2048 blocks plus estimated overhead for
filenames (see -o option)
2: like 1, but all files in directory are put together (as \"atom\") onto the
same medium. This does not apply to subdirectories, however.
3: like 2, but don't coalesc files when the size of the \"atom\" becomes too
large for the medium size (currently $max)
4: like 2, but the max. size of the atoms is limited to $max (storing the
rest on another medium)

-F|--follow Follow symlinks. Use with care!
-b|--blksize NUMBER, block size of the target filesystem (currently $bsize).
-o|--overhead NUMBER, overhead caused by directory entries (as factor for the
filename length, default: 50, empiricaly found for Joliet+RR
with not-so-deep directory structure). Works in exploration
mode.
-a|--accuracy NUMBER (1=faster, large number=better efficiency, default: $acc)
-S|--simple Simple/stupid/alphabetic mode
-T|--input FILENAME (or - for STDIN): List with sizes and paths, try:
find dir -type f -printf \"%s %p\\n\"
to get an example. Avoid duplicates! Unit suffixes are allowed.
-h|--help Show this option summary
-v|--verbose More verbosity

File sizes are expected to be in bytes, append modifier letters to multiply
with a factor, eg 200M (b,k,K,m,M,g,G for Bytes, Kb, KiB, Mb, MiB, Gb, GiB).
The default output mode is creating file catalogs useable with
mkisofs -D -r --joliet-long -graft-points -path-list CATALOG

Examples:
dirsplit -m -s 120M -e4 largedirwithdata/ -p /zipmedia/backup_ #move stuff into splitted backup dirs
dirsplit -s 700M -e2 music/ # make mkisofs catalogs to burn all music to 700M CDRs, keep single files in each dir together
dirsplit -s 700M -e2 -f '/other\\/Soundtracks/' music/ # like above, only take files from other/Soundtracks
dirsplit -s 700M -e2 -f '!/Thumbs.db|Desktop.ini|\\.m3u\$/i' # like above, ignore some junk files and playlists, both letter cases

Bugs: overhead trough blocksize alignment and directory entry storage varies,
heavily depends on the target filesystem and configuration (see -b and -o).

You should compare the required size of the created catalogs, eg.:
for x in *list ; do mkisofs -quiet -D -r --joliet-long -graft-points \\
-path-list \$x -print-size; done
(output in blocks of 2048 bytes) with the expected size (-s) and media data
(cdrecord -v -toc ...).
";
    print $msglong;
    exit 0;
}
|
|
|
|
# bp_bestfit(MAX, INPUT, OUTPUT)
#
# Bin packing, best fit: every size is put into the bin that has the least
# space left after adding it; a new bin is opened when it fits nowhere.
# Parms:   bin size (int), input array (arr reference, the sizes in the
#          order they are to be placed), output array (arr reference, is
#          overwritten with one array reference of sizes per bin)
# Returns: wasted space (int), the free space of all bins but the last one
sub bp_bestfit {
    my ( $max, $in, $target ) = @_;
    my @out;    # the bins
    my @bel;    # fill level of every bin

    for my $obj ( @{$in} ) {
        my $bestplace = scalar(@out);    # index of a new bin by default
        my $bestwert;                    # smallest remainder seen so far

        for my $bin ( 0 .. $#out ) {
            my $rest = $max - $bel[$bin] - $obj;

            # a remainder of exactly 0 is the perfect fit, not a miss
            next if ( $rest < 0 );
            if ( !defined($bestwert) || $rest < $bestwert ) {
                $bestplace = $bin;
                $bestwert  = $rest;
            }
        }

        if ( $bestplace > $#out ) {
            $bel[$bestplace] = $obj;
            push( @out, [$obj] );
        }
        else {
            $bel[$bestplace] += $obj;
            push( @{ $out[$bestplace] }, $obj );
        }
    }

    # count all rests but the last one
    my $ret = 0;
    for my $bin ( 0 .. $#out - 1 ) {
        $ret += ( $max - $bel[$bin] );
    }

    @{$target} = @out;
    return $ret;
}
|
|
|
|
# bp_firstfit(MAX, INPUT, OUTPUT)
#
# Bin packing, first fit: every size is put into the first bin with enough
# free space; a new bin is opened when it fits nowhere.
# Parms:   bin size (int), input array (arr reference, the sizes in the
#          order they are to be placed), output array (arr reference, is
#          overwritten with one array reference of sizes per bin)
# Returns: wasted space (int), the free space of all bins but the last one
sub bp_firstfit {
    my ( $max, $in, $target ) = @_;
    my @out;    # the bins
    my @bel;    # fill level of every bin

    # The bin indices are lexical to this sub on purpose: the main program
    # has a counter of its own which must not be touched from here.
    PIECE: foreach my $obj ( @{$in} ) {

        # first fit, use the first bin with enough free space
        for my $bin ( 0 .. $#out ) {
            my $newsize = $bel[$bin] + $obj;
            if ( $newsize <= $max ) {
                # fits here
                $bel[$bin] = $newsize;
                push( @{ $out[$bin] }, $obj );
                next PIECE;
            }
        }

        # no luck, open a new bin
        push( @bel, $obj );
        push( @out, [$obj] );
    }

    # sum up all rests except the one from the last bin
    my $ret = 0;
    for my $bin ( 0 .. $#out - 1 ) {
        $ret += ( $max - $bel[$bin] );
    }

    @{$target} = @out;
    return $ret;
}
|