ShuffleChains

Bash

GNU GPL

Sorts a list of files into an order that minimizes variance in file size between sequential subsets

Download (right click, save as, rename as appropriate)

Embed

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/bin/bash
#
# Reorders the files listed in "<dataset>.src" so that the running average
# file size of the list stays close to the overall average file size.
#
# Usage: <this script> <dataset>
#
# Input:  "<dataset>.src". Every line that contains an absolute path built
#         from alphanumerics, '.', '_', '/', '-' (or '\') and ending in
#         ".root" contributes that path. Lines without such a path, and paths
#         that are not readable regular files, are skipped.
# Output: "<dataset>.src" is replaced by one line per kept file, of the form
#             myChain->Add("<path>",0);
#         The list is first written to "<dataset>.src--SHUFFLE" and then moved
#         over the input.
# Exit status: 0 on success, 2 when no dataset is given, 1 on any other
#         failure. The input file is left untouched on failure.

# Print a diagnostic message on stderr.
err() {
  printf '%s\n' "$*" >&2
}

#######################################
# Emit "<size-in-bytes> <path>" on stdout for every usable path in a chain
# file (see the header comment for what counts as usable).
# Arguments: $1 - chain file to scan
# Outputs:   one "<size> <path>" line per kept file; warnings on stderr
#######################################
list_sized_files() {
  local chain_file=$1
  # Same character set as the historical grep pattern; inside a bracket
  # expression the backslash is a literal character. The dot before "root"
  # is escaped so that only a real ".root" suffix matches.
  local -r path_re='/[[:alnum:]\._/-]*\.root'
  local line path size

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ "$line" =~ $path_re ]] || continue
    path=${BASH_REMATCH[0]}
    if [[ ! -f "$path" ]]; then
      err "skipping missing file: ${path}"
      continue
    fi
    if ! size=$(wc -c <"$path"); then
      err "skipping unreadable file: ${path}"
      continue
    fi
    # Some wc implementations pad the count with blanks; normalise it.
    size=$(( size ))
    printf '%d %s\n' "$size" "$path"
  done <"$chain_file"
}

#######################################
# Rewrite "<dataset>.src" in size-balanced order.
#
# The kept files are sorted by size, largest first. The output starts with
# the largest file; after each file written, the next one is the largest
# remaining file if the running average size (integer division) is below the
# overall average, otherwise the smallest remaining file.
#
# Arguments: $1 - dataset name; the file processed is "$1.src"
# Returns:   0 on success, 2 if $1 is missing or empty, 1 on other failures
#######################################
shuffle_chain() {
  if [[ -z "${1:-}" ]]; then
    err "Usage: ${0##*/} <dataset>   (operates on <dataset>.src)"
    return 2
  fi

  local dataset=$1
  local in_chain_file="${dataset}.src"
  local out_chain_file="${in_chain_file}--SHUFFLE"

  if [[ ! -f "$in_chain_file" || ! -r "$in_chain_file" ]]; then
    err "cannot read ${in_chain_file}"
    return 1
  fi

  # Load the size-sorted list (largest first) into parallel arrays.
  local -a sizes=() paths=()
  local size path
  local total=0
  while read -r size path; do
    sizes+=("$size")
    paths+=("$path")
    total=$(( total + size ))
  done < <(list_sized_files "$in_chain_file" | LC_ALL=C sort -nr)

  # Count only the files actually kept, so the average is not diluted by
  # skipped lines and the selection loop never runs past the list.
  local count=${#sizes[@]}
  if (( count == 0 )); then
    err "no existing .root files listed in ${in_chain_file}"
    return 1
  fi
  local average=$(( total / count ))

  local top=0
  local bottom=$(( count - 1 ))
  local running_total=0
  local picked=0
  local pick
  local take_largest=1
  while (( top <= bottom )); do
    if (( take_largest )); then
      pick=$top
      top=$(( top + 1 ))
    else
      pick=$bottom
      bottom=$(( bottom - 1 ))
    fi
    printf 'myChain->Add("%s",0);\n' "${paths[pick]}"
    running_total=$(( running_total + sizes[pick] ))
    picked=$(( picked + 1 ))
    take_largest=$(( running_total / picked < average ))
  done >"$out_chain_file" || {
    err "cannot write ${out_chain_file}"
    return 1
  }

  if ! mv -- "$out_chain_file" "$in_chain_file"; then
    err "cannot replace ${in_chain_file}"
    rm -f -- "$out_chain_file"
    return 1
  fi
}

# Must remain the last statement: the script's exit status is its status.
shuffle_chain "$@"