149 #include <unordered_map> 153 using std::make_pair;
159 const
Int_t TMVA::MethodBDT::fgDebugLevel = 0;
171 , fSigToBkgFraction(0)
176 , fBaggedGradBoost(kFALSE)
180 , fMinNodeSizeS("5%")
183 , fMinLinCorrForFisher(.8)
184 , fUseExclusiveVars(0)
185 , fUseYesNoLeaf(kFALSE)
186 , fNodePurityLimit(0)
191 , fFValidationEvents(0)
193 , fRandomisedTrees(kFALSE)
195 , fUsePoissonNvars(0)
196 , fUseNTrainEvents(0)
197 , fBaggedSampleFraction(0)
198 , fNoNegWeightsInTraining(kFALSE)
199 , fInverseBoostNegWeights(kFALSE)
200 , fPairNegWeightsGlobal(kFALSE)
201 , fTrainWithNegWeights(kFALSE)
202 , fDoBoostMonitor(kFALSE)
210 , fDoPreselection(kFALSE)
211 , fSkipNormalization(kFALSE)
212 , fHistoricBool(kFALSE)
214 fMonitorNtuple =
NULL;
216 fRegressionLossFunctionBDTG =
nullptr;
226 , fSigToBkgFraction(0)
231 , fBaggedGradBoost(
kFALSE)
235 , fMinNodeSizeS(
"5%")
238 , fMinLinCorrForFisher(.8)
239 , fUseExclusiveVars(0)
241 , fNodePurityLimit(0)
246 , fFValidationEvents(0)
248 , fRandomisedTrees(
kFALSE)
250 , fUsePoissonNvars(0)
251 , fUseNTrainEvents(0)
252 , fBaggedSampleFraction(0)
253 , fNoNegWeightsInTraining(
kFALSE)
254 , fInverseBoostNegWeights(
kFALSE)
255 , fPairNegWeightsGlobal(
kFALSE)
256 , fTrainWithNegWeights(
kFALSE)
266 , fSkipNormalization(
kFALSE)
344 DeclareOptionRef(
fMinNodeSizeS=tmp,
"MinNodeSize",
"Minimum percentage of training events required in a leaf node (default: Classification: 5%, Regression: 0.2%)");
346 DeclareOptionRef(
fNCuts,
"nCuts",
"Number of grid points in variable range used in finding optimal cut in node splitting");
370 DeclareOptionRef(
fRandomisedTrees,
"UseRandomisedTrees",
"Determine at each node splitting the cut variable only as the best out of a random subset of variables (like in RandomForests)");
372 DeclareOptionRef(
fUsePoissonNvars,
"UsePoissonNvars",
"Interpret \"UseNvars\" not as fixed number but as mean of a Poisson distribution in each split with RandomisedTree option");
373 DeclareOptionRef(
fBaggedSampleFraction=.6,
"BaggedSampleFraction",
"Relative size of bagged event sample to original size of the data sample (used whenever bagging is used (i.e. UseBaggedBoost, Bagging,)" );
376 "Use Sig or Bkg categories, or the purity=S/(S+B) as classification of the leaf node -> Real-AdaBoost");
381 DeclareOptionRef(
fNegWeightTreatment=
"InverseBoostNegWeights",
"NegWeightTreatment",
"How to treat events with negative weights in the BDT training (particular the boosting) : IgnoreInTraining; Boost With inverse boostweight; Pair events with negative and positive weights in training sample and *annihilate* them (experimental!)");
416 DeclareOptionRef(
fHuberQuantile = 0.7,
"HuberQuantile",
"In the Huber loss function this is the quantile that separates the core from the tails in the residuals distribution.");
430 DeclareOptionRef(
fPruneMethodS,
"PruneMethod",
"Note: for BDTs use small trees (e.g.MaxDepth=3) and NoPruning: Pruning: Method used for pruning (removal) of statistically insignificant branches ");
444 DeclareOptionRef(
fBaggedGradBoost=
kFALSE,
"UseBaggedGrad",
"deprecated: Use *UseBaggedBoost* instead: Use only a random subsample of all events for growing the trees in each iteration.");
445 DeclareOptionRef(
fBaggedSampleFraction,
"GradBaggingFraction",
"deprecated: Use *BaggedSampleFraction* instead: Defines the fraction of events to be used in each iteration, e.g. when UseBaggedGrad=kTRUE. ");
446 DeclareOptionRef(
fUseNTrainEvents,
"UseNTrainEvents",
"deprecated: Use *BaggedSampleFraction* instead: Number of randomly picked training events used in randomised (and bagged) trees");
460 "Use weighted trees or simple average in classification from the forest");
482 Log() << kFATAL <<
"<ProcessOptions> unknown Separation Index option " <<
fSepTypeS <<
" called" <<
Endl;
487 Log() << kFATAL <<
"<ProcessOptions> Huber Quantile must be in range [0,1]. Value given, " <<
fHuberQuantile <<
", does not match this criteria" <<
Endl;
505 Log() << kFATAL <<
"<ProcessOptions> unknown PruneMethod " <<
fPruneMethodS <<
" option called" <<
Endl;
511 <<
"Sorry automatic pruning strength determination is not implemented yet for ExpectedErrorPruning" <<
Endl;
517 Log() << kWARNING <<
"You have explicitly set ** nEventsMin = " <<
fMinNodeEvents<<
" ** the min absolute number \n" 518 <<
"of events in a leaf node. This is DEPRECATED, please use the option \n" 519 <<
"*MinNodeSize* giving the relative number as percentage of training \n" 520 <<
"events instead. \n" 523 Log() << kWARNING <<
"Note also that explicitly setting *nEventsMin* so far OVERWRITES the option recommended \n" 537 Log() << kINFO <<
"the option *InverseBoostNegWeights* does not exist for BoostType=Grad --> change" <<
Endl;
538 Log() << kINFO <<
"to new default for GradBoost *Pray*" <<
Endl;
539 Log() << kDEBUG <<
"i.e. simply keep them as if which should work fine for Grad Boost" <<
Endl;
552 Log() << kWARNING <<
"You have chosen to use more than half of your training sample " 553 <<
"to optimize the automatic pruning algorithm. This is probably wasteful " 554 <<
"and your overall results will be degraded. Are you sure you want this?" 559 if (this->
Data()->HasNegativeEventWeights()){
560 Log() << kINFO <<
" You are using a Monte Carlo that has also negative weights. " 561 <<
"That should in principle be fine as long as on average you end up with " 562 <<
"something positive. For this you have to make sure that the minimal number " 563 <<
"of (un-weighted) events demanded for a tree node (currently you use: MinNodeSize=" 565 <<
", (or the deprecated equivalent nEventsMin) you can set this via the " 566 <<
"BDT option string when booking the " 567 <<
"classifier) is large enough to allow for reasonable averaging!!! " 568 <<
" If this does not help.. maybe you want to try the option: IgnoreNegWeightsInTraining " 569 <<
"which ignores events with negative weight in the training. " <<
Endl 570 <<
Endl <<
"Note: You'll get a WARNING message during the training if that should ever happen" <<
Endl;
575 Log() << kWARNING <<
"Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" <<
Endl;
580 Log() << kWARNING <<
"Regression Trees do not work with Separation type other than <RegressionVariance> --> I will use it instead" <<
Endl;
584 Log() << kWARNING <<
"Sorry, UseFisherCuts is not available for regression analysis, I will ignore it!" <<
Endl;
588 Log() << kWARNING <<
"Sorry, the option of nCuts<0 using a more elaborate node splitting algorithm " <<
Endl;
589 Log() << kWARNING <<
"is not implemented for regression analysis ! " <<
Endl;
590 Log() << kWARNING <<
"--> I switch do default nCuts = 20 and use standard node splitting"<<
Endl;
595 Log() << kINFO <<
" Randomised trees use no pruning" <<
Endl;
601 Log() << kWARNING <<
"When using the option UseFisherCuts, the other option nCuts<0 (i.e. using" <<
Endl;
602 Log() <<
" a more elaborate node splitting algorithm) is not implemented. " <<
Endl;
609 Log() << kERROR <<
" Zero Decision Trees demanded... that does not work !! " 610 <<
" I set it to 1 .. just so that the program does not crash" 623 Log() << kFATAL <<
"<ProcessOptions> unknown option for treating negative event weights during training " <<
fNegWeightTreatment <<
" requested" <<
Endl;
627 Log() << kWARNING <<
" you specified the option NegWeightTreatment=PairNegWeightsGlobal : This option is still considered EXPERIMENTAL !! " <<
Endl;
638 Log() << kWARNING <<
"You have specified a deprecated option *NNodesMax="<<
fNNodesMax 639 <<
"* \n this has been translated to MaxDepth="<<
fMaxDepth<<
Endl;
645 Log() << kWARNING <<
"You have specified a deprecated option *UseNTrainEvents="<<
fUseNTrainEvents 652 Log() << kWARNING <<
"You have specified a deprecated option *UseBaggedGrad* --> please use *UseBaggedBoost* instead" <<
Endl;
660 if (sizeInPercent > 0 && sizeInPercent < 50){
664 Log() << kFATAL <<
"you have demanded a minimal node size of " 665 << sizeInPercent <<
"% of the training events.. \n" 666 <<
" that somehow does not make sense "<<
Endl;
678 Log() << kFATAL <<
"I had problems reading the option MinNodeEvents, which " 679 <<
"after removing a possible % sign now reads " << sizeInPercent <<
Endl;
742 Log() << kDEBUG <<
" successfully(?) reset the method " <<
Endl;
771 std::vector<const TMVA::Event*> tmpEventSample;
772 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
775 tmpEventSample.push_back(event);
781 for (
UInt_t i=0; i<tmpEventSample.size(); i++)
delete tmpEventSample[i];
786 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
798 if (firstNegWeight) {
799 Log() << kWARNING <<
" Note, you have events with negative event weight in the sample, but you've chosen to ignore them" <<
Endl;
803 }
else if (event->GetWeight()==0){
804 if (firstZeroWeight) {
806 Log() <<
"Events with weight == 0 are going to be simply ignored " <<
Endl;
810 if (event->GetWeight() < 0) {
815 Log() << kWARNING <<
"Events with negative event weights are found and " 816 <<
" will be removed prior to the actual BDT training by global " 817 <<
" paring (and subsequent annihilation) with positiv weight events" 820 Log() << kWARNING <<
"Events with negative event weights are USED during " 821 <<
"the BDT training. This might cause problems with small node sizes " 822 <<
"or with the boosting. Please remove negative events from training " 823 <<
"using the option *IgnoreEventsWithNegWeightsInTraining* in case you " 824 <<
"observe problems with the boosting" 832 Int_t imodulo =
static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ?
ceil(modulo) :
floor(modulo) );
843 Log() << kINFO <<
"<InitEventSample> Internally I use " <<
fEventSample.size()
846 <<
"% of training used for validation)" << Endl;
854 Log() << kDEBUG <<
"\t<InitEventSample> For classification trees, "<<
Endl;
855 Log() << kDEBUG <<
" \tthe effective number of backgrounds is scaled to match "<<
Endl;
856 Log() << kDEBUG <<
" \tthe signal. Otherwise the first boosting step would do 'just that'!"<<
Endl;
872 Int_t sumSig=0, sumBkg=0;
882 if (sumSigW && sumBkgW){
885 Log() << kDEBUG <<
"\tre-normalise events such that Sig and Bkg have respective sum of weights = " 887 Log() << kDEBUG <<
" \tsig->sig*"<<normSig <<
"ev. bkg->bkg*"<<normBkg <<
"ev." <<
Endl;
888 Log() << kHEADER <<
"#events: (reweighted) sig: "<< sumSigW*normSig <<
" bkg: " << sumBkgW*normBkg <<
Endl;
889 Log() << kINFO <<
"#events: (unweighted) sig: "<< sumSig <<
" bkg: " << sumBkg <<
Endl;
890 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
895 Log() << kINFO <<
"--> could not determine scaling factors as either there are " <<
Endl;
896 Log() << kINFO <<
" no signal events (sumSigW="<<sumSigW<<
") or no bkg ev. (sumBkgW="<<sumBkgW<<
")"<<
Endl;
930 std::vector<const Event*> negEvents;
940 if (totalNegWeights == 0 ) {
941 Log() << kINFO <<
"no negative event weights found .. no preprocessing necessary" <<
Endl;
944 Log() << kINFO <<
"found a total of " << totalNegWeights <<
" of negative event weights which I am going to try to pair with positive events to annihilate them" <<
Endl;
945 Log() << kINFO <<
"found a total of " << totalPosWeights <<
" of events with positive weights" <<
Endl;
946 Log() << kINFO <<
"--> total sum of weights = " << totalWeights <<
" = " << totalNegWeights+totalPosWeights <<
Endl;
953 for (
Int_t i=0; i<2; i++){
954 invCov = ((*cov)[i]);
956 std::cout <<
"<MethodBDT::PreProcessNeg...> matrix is almost singular with determinant=" 958 <<
" did you use the variables that are linear combinations or highly correlated?" 962 std::cout <<
"<MethodBDT::PreProcessNeg...> matrix is singular with determinant=" 964 <<
" did you use the variables that are linear combinations?" 973 Log() << kINFO <<
"Found a total of " << totalNegWeights <<
" in negative weights out of " <<
fEventSample.size() <<
" training events " <<
Endl;
974 Timer timer(negEvents.size(),
"Negative Event paired");
975 for (
UInt_t nev = 0; nev < negEvents.size(); nev++){
976 timer.DrawProgressBar( nev );
977 Double_t weight = negEvents[nev]->GetWeight();
978 UInt_t iClassID = negEvents[nev]->GetClass();
979 invCov = ((*cov)[iClassID]);
990 dist += (negEvents[nev]->GetValue(ivar)-
fEventSample[iev]->GetValue(ivar))*
991 (*invCov)[ivar][jvar]*
992 (negEvents[nev]->GetValue(jvar)-
fEventSample[iev]->GetValue(jvar));
995 if (dist < minDist) { iMin=iev; minDist=
dist;}
1003 negEvents[nev]->SetBoostWeight( 0 );
1006 negEvents[nev]->SetBoostWeight( newWeight/negEvents[nev]->GetOriginalWeight() );
1010 }
else Log() << kFATAL <<
"preprocessing didn't find event to pair with the negative weight ... probably a bug" <<
Endl;
1011 weight = negEvents[nev]->GetWeight();
1014 Log() << kINFO <<
"<Negative Event Pairing> took: " <<
timer.GetElapsedTime()
1018 totalNegWeights = 0;
1019 totalPosWeights = 0;
1026 std::vector<const Event*> newEventSample;
1047 if (totalNegWeights < 0)
Log() << kFATAL <<
" compensation of negative event weights with positive ones did not work " << totalNegWeights <<
Endl;
1052 Log() << kINFO <<
" after PreProcessing, the Event sample is left with " <<
fEventSample.size() <<
" events (unweighted), all with positive weights, adding up to " << totalWeights <<
Endl;
1053 Log() << kINFO <<
" nSig="<<nSig <<
" sigWeight="<<sigWeight <<
" nBkg="<<nBkg <<
" bkgWeight="<<bkgWeight <<
Endl;
1065 std::map<TString,TMVA::Interval*> tuneParameters;
1066 std::map<TString,Double_t> tunedParameters;
1075 tuneParameters.insert(std::pair<TString,Interval*>(
"NTrees",
new Interval(10,1000,5)));
1076 tuneParameters.insert(std::pair<TString,Interval*>(
"MaxDepth",
new Interval(2,4,3)));
1077 tuneParameters.insert(std::pair<TString,Interval*>(
"MinNodeSize",
new LogInterval(1,30,30)));
1083 tuneParameters.insert(std::pair<TString,Interval*>(
"AdaBoostBeta",
new Interval(.2,1.,5)));
1086 tuneParameters.insert(std::pair<TString,Interval*>(
"Shrinkage",
new Interval(0.05,0.50,5)));
1091 tuneParameters.insert(std::pair<TString,Interval*>(
"UseNvars",
new Interval(min_var,max_var,4)));
1095 Log()<<kINFO <<
" the following BDT parameters will be tuned on the respective *grid*\n"<<
Endl;
1096 std::map<TString,TMVA::Interval*>::iterator it;
1097 for(it=tuneParameters.begin(); it!= tuneParameters.end(); it++){
1098 Log() << kWARNING << it->first <<
Endl;
1099 std::ostringstream oss;
1100 (it->second)->
Print(oss);
1106 tunedParameters=optimize.
optimize();
1108 return tunedParameters;
1117 std::map<TString,Double_t>::iterator it;
1118 for(it=tuneParameters.begin(); it!= tuneParameters.end(); it++){
1119 Log() << kWARNING << it->first <<
" = " << it->second <<
Endl;
1121 else if (it->first ==
"MinNodeSize" )
SetMinNodeSize (it->second);
1125 else if (it->first ==
"Shrinkage" )
SetShrinkage (it->second);
1128 else Log() << kFATAL <<
" SetParameter for " << it->first <<
" not yet implemented " <<
Endl;
1145 Log() << kERROR <<
" Zero Decision Trees demanded... that does not work !! " 1146 <<
" I set it to 1 .. just so that the program does not crash" 1152 std::vector<TString> titles = {
"Boost weight",
"Error Fraction"};
1160 if (
IsNormalised())
Log() << kFATAL <<
"\"Normalise\" option cannot be used with BDT; " 1161 <<
"please remove the option from the configuration string, or " 1162 <<
"use \"!Normalise\"" 1168 Log() << kINFO <<
"Training "<<
fNTrees <<
" Decision Trees ... patience please" <<
Endl;
1170 Log() << kDEBUG <<
"Training with maximal depth = " <<
fMaxDepth 1180 TString hname =
"AdaBooost weight distribution";
1190 hname=
"Boost event weights distribution";
1205 results->
Store(h,
"BoostWeights");
1210 TH2* boostMonitor =
new TH2F(
"BoostMonitor",
"ROC Integral Vs iTree",2,0,
fNTrees,2,0,1.05);
1212 boostMonitor->
SetYTitle(
"ROC Integral");
1213 results->
Store(boostMonitor,
"BoostMonitor");
1215 boostMonitorGraph->
SetName(
"BoostMonitorGraph");
1216 boostMonitorGraph->
SetTitle(
"ROCIntegralVsNTrees");
1217 results->
Store(boostMonitorGraph,
"BoostMonitorGraph");
1224 results->
Store(h,
"BoostWeightsVsTree");
1230 results->
Store(h,
"ErrorFrac");
1233 nodesBeforePruningVsTree->
SetXTitle(
"#tree");
1234 nodesBeforePruningVsTree->
SetYTitle(
"#tree nodes");
1235 results->
Store(nodesBeforePruningVsTree);
1238 nodesAfterPruningVsTree->
SetXTitle(
"#tree");
1239 nodesAfterPruningVsTree->
SetYTitle(
"#tree nodes");
1240 results->
Store(nodesAfterPruningVsTree);
1250 Int_t nNodesBeforePruningCount = 0;
1251 Int_t nNodesAfterPruningCount = 0;
1253 Int_t nNodesBeforePruning = 0;
1254 Int_t nNodesAfterPruning = 0;
1264 while (itree <
fNTrees && continueBoost){
1280 Log() << kFATAL <<
"Multiclass is currently only supported by gradient boost. " 1281 <<
"Please change boost option accordingly (GradBoost)." 1285 for (
UInt_t i=0;i<nClasses;i++){
1291 fForest.back()->SetUseFisherCuts();
1303 Log() << kWARNING <<
"stopped boosting at itree="<<itree <<
Endl;
1315 fForest.back()->SetUseFisherCuts();
1323 nNodesBeforePruning =
fForest.back()->CleanTree();
1326 nNodesBeforePruningCount += nNodesBeforePruning;
1327 nodesBeforePruningVsTree->
SetBinContent(itree+1,nNodesBeforePruning);
1332 std::vector<const Event*> * validationSample =
NULL;
1340 Log() << kWARNING <<
"stopped boosting at itree="<<itree <<
Endl;
1354 nNodesAfterPruning =
fForest.back()->GetNNodes();
1355 nNodesAfterPruningCount += nNodesAfterPruning;
1356 nodesAfterPruningVsTree->
SetBinContent(itree+1,nNodesAfterPruning);
1365 if ( itree==
fNTrees-1 || (!(itree%500)) ||
1366 (!(itree%250) && itree <1000)||
1367 (!(itree%100) && itree < 500)||
1368 (!(itree%50) && itree < 250)||
1369 (!(itree%25) && itree < 150)||
1370 (!(itree%10) && itree < 50)||
1371 (!(itree%5) && itree < 20)
1383 Log() << kDEBUG <<
"\t<Train> average number of nodes (w/o pruning) : " 1387 Log() << kDEBUG <<
"\t<Train> average number of nodes before/after pruning : " 1388 << nNodesBeforePruningCount/
GetNTrees() <<
" / " 1397 Log() << kDEBUG <<
"Now I delete the privat data sample"<<
Endl;
1414 for (
UInt_t itree=0; itree<nTrees; itree++) {
1419 return 2.0/(1.0+
exp(-2.0*sum))-1;
1429 std::vector<Double_t> expCache;
1430 if (cls == nClasses - 1) {
1431 expCache.resize(nClasses);
1433 for (
auto e : eventSample) {
1435 if (cls == nClasses - 1) {
1437 std::transform(residualsThisEvent.begin(),
1438 residualsThisEvent.begin() + nClasses,
1439 expCache.begin(), [](
Double_t d) {
return exp(d); });
1440 for (
UInt_t i = 0; i < nClasses; i++) {
1442 for (
UInt_t j = 0; j < nClasses; j++) {
1444 norm += expCache[j] / expCache[i];
1448 Double_t res = (
e->GetClass() == i) ? (1.0 - p_cls) : (-p_cls);
1454 for (
auto e : eventSample) {
1457 Double_t p_sig = 1.0 / (1.0 +
exp(-2.0 * residualAt0));
1488 std::unordered_map<TMVA::DecisionTreeNode*, LeafInfo> leaves;
1489 for (
auto e : eventSample) {
1492 auto &
v = leaves[node];
1493 auto target =
e->GetTarget(cls);
1494 v.sumWeightTarget += target * weight;
1495 v.sum2 +=
fabs(target) * (1.0-
fabs(target)) * weight * weight;
1497 for (
auto &iLeave : leaves) {
1498 constexpr
auto minValue = 1
e-30;
1499 if (iLeave.second.sum2 < minValue) {
1500 iLeave.second.sum2 = minValue;
1502 iLeave.first->SetResponse(
fShrinkage/
DataInfo().GetNClasses() * iLeave.second.sumWeightTarget/iLeave.second.sum2);
1518 std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > > leaves;
1519 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
1526 for (std::map<
TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > >::iterator iLeave=leaves.begin();
1527 iLeave!=leaves.end();++iLeave){
1529 (iLeave->first)->SetResponse(
fShrinkage*fit);
1546 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
1556 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
1557 for (
UInt_t i=0;i<nClasses;i++){
1559 Double_t r = (*e)->GetClass()==i?(1-1.0/nClasses):(-1.0/nClasses);
1566 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
1591 return ncorrect / (ncorrect + nfalse);
1611 returnVal = this->
GradBoost (eventSample, dt, cls);
1613 returnVal = this->
GradBoost (eventSample, dt);
1617 Log() << kFATAL <<
"<Boost> unknown boost option " <<
fBoostType<<
" called" <<
Endl;
1636 TH1F *tmpS =
new TH1F(
"tmpS",
"", 100 , -1., 1.00001 );
1637 TH1F *tmpB =
new TH1F(
"tmpB",
"", 100 , -1., 1.00001 );
1652 for (
UInt_t iev=0; iev < nevents; iev++){
1655 if (event->GetClass() == signalClassNr) {tmp=tmpS;}
1661 std::vector<TH1F*> hS;
1662 std::vector<TH1F*> hB;
1664 hS.push_back(
new TH1F(
Form(
"SigVar%dAtTree%d",ivar,iTree),
Form(
"SigVar%dAtTree%d",ivar,iTree),100,
DataInfo().GetVariableInfo(ivar).GetMin(),
DataInfo().GetVariableInfo(ivar).GetMax()));
1665 hB.push_back(
new TH1F(
Form(
"BkgVar%dAtTree%d",ivar,iTree),
Form(
"BkgVar%dAtTree%d",ivar,iTree),100,
DataInfo().GetVariableInfo(ivar).GetMin(),
DataInfo().GetVariableInfo(ivar).GetMax()));
1666 results->
Store(hS.back(),hS.back()->GetTitle());
1667 results->
Store(hB.back(),hB.back()->GetTitle());
1674 TH1F *tmpBoostWeightsS =
new TH1F(
Form(
"BoostWeightsInTreeS%d",iTree),
Form(
"BoostWeightsInTreeS%d",iTree),100,0.,max);
1675 TH1F *tmpBoostWeightsB =
new TH1F(
Form(
"BoostWeightsInTreeB%d",iTree),
Form(
"BoostWeightsInTreeB%d",iTree),100,0.,max);
1676 results->
Store(tmpBoostWeightsS,tmpBoostWeightsS->
GetTitle());
1677 results->
Store(tmpBoostWeightsB,tmpBoostWeightsB->
GetTitle());
1679 TH1F *tmpBoostWeights;
1680 std::vector<TH1F*> *
h;
1684 tmpBoostWeights=tmpBoostWeightsS;
1687 tmpBoostWeights=tmpBoostWeightsB;
1728 Double_t err=0, sumGlobalw=0, sumGlobalwfalse=0, sumGlobalwfalse2=0;
1730 std::vector<Double_t> sumw(
DataInfo().GetNClasses(),0);
1733 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
1736 UInt_t iclass=(*e)->GetClass();
1741 sumGlobalwfalse += w * tmpDev;
1742 sumGlobalwfalse2 += w * tmpDev*tmpDev;
1743 if (tmpDev > maxDev) maxDev = tmpDev;
1748 if (!(isSignalType ==
DataInfo().IsSignal(*
e))) {
1749 sumGlobalwfalse+= w;
1756 sumGlobalwfalse+= w*trueType*dtoutput;
1761 err = sumGlobalwfalse/sumGlobalw ;
1765 err = sumGlobalwfalse/maxDev/sumGlobalw ;
1768 err = sumGlobalwfalse2/maxDev/maxDev/sumGlobalw ;
1772 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
1775 err += w * (1 -
exp (-tmpDev/maxDev)) / sumGlobalw;
1780 Log() << kFATAL <<
" you've chosen a Loss type for Adaboost other than linear, quadratic or exponential " 1782 <<
"and this is not implemented... a typo in the options ??" <<
Endl;
1786 Log() << kDEBUG <<
"BDT AdaBoos wrong/all: " << sumGlobalwfalse <<
"/" << sumGlobalw <<
Endl;
1790 std::vector<Double_t> newSumw(sumw.size(),0);
1797 Log() << kERROR <<
" YOUR tree has only 1 Node... kind of a funny *tree*. I cannot " 1798 <<
"boost such a thing... if after 1 step the error rate is == 0.5" 1800 <<
"please check why this happens, maybe too many events per node requested ?" 1804 Log() << kERROR <<
" The error rate in the BDT boosting is > 0.5. ("<< err
1805 <<
") That should not happen, please check your code (i.e... the BDT code), I " 1806 <<
" stop boosting here" <<
Endl;
1810 }
else if (err < 0) {
1811 Log() << kERROR <<
" The error rate in the BDT boosting is < 0. That can happen" 1812 <<
" due to improper treatment of negative weights in a Monte Carlo.. (if you have" 1813 <<
" an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)" 1814 <<
" for the time being I set it to its absolute value.. just to continue.." <<
Endl;
1823 Log() << kDEBUG <<
"BDT AdaBoos wrong/all: " << sumGlobalwfalse <<
"/" << sumGlobalw <<
" 1-err/err="<<boostWeight<<
" log.."<<
TMath::Log(boostWeight)<<
Endl;
1828 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
1835 if ( (*e)->GetWeight() > 0 ){
1836 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
1841 else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
1853 if ( (*e)->GetWeight() > 0 ){
1854 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
1859 else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
1862 newSumGlobalw+=(*e)->GetWeight();
1863 newSumw[(*e)->GetClass()] += (*e)->GetWeight();
1869 Log() << kDEBUG <<
"new Nsig="<<newSumw[0]*globalNormWeight <<
" new Nbkg="<<newSumw[1]*globalNormWeight <<
Endl;
1872 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
1877 else (*e)->ScaleBoostWeight( globalNormWeight );
1911 Double_t err=0, sumGlobalWeights=0, sumGlobalCost=0;
1913 std::vector<Double_t> sumw(
DataInfo().GetNClasses(),0);
1915 for (vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
1917 sumGlobalWeights += w;
1918 UInt_t iclass=(*e)->GetClass();
1923 Log() << kFATAL <<
" AdaCost not implemented for regression"<<
Endl;
1929 Bool_t isSelectedSignal = (dtoutput>0);
1930 if (isTrueSignal) trueType = 1;
1934 if (isTrueSignal && isSelectedSignal) cost=Css;
1935 else if (isTrueSignal && !isSelectedSignal) cost=Cts_sb;
1936 else if (!isTrueSignal && isSelectedSignal) cost=Ctb_ss;
1937 else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
1938 else Log() << kERROR <<
"something went wrong in AdaCost" <<
Endl;
1940 sumGlobalCost+= w*trueType*dtoutput*cost;
1946 Log() << kFATAL <<
" AdaCost not implemented for regression"<<
Endl;
1951 sumGlobalCost /= sumGlobalWeights;
1956 vector<Double_t> newSumClassWeights(sumw.size(),0);
1962 for (vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
1966 Bool_t isSelectedSignal = (dtoutput>0);
1967 if (isTrueSignal) trueType = 1;
1971 if (isTrueSignal && isSelectedSignal) cost=Css;
1972 else if (isTrueSignal && !isSelectedSignal) cost=Cts_sb;
1973 else if (!isTrueSignal && isSelectedSignal) cost=Ctb_ss;
1974 else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
1975 else Log() << kERROR <<
"something went wrong in AdaCost" <<
Endl;
1979 if ( (*e)->GetWeight() > 0 ){
1980 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
1987 newSumGlobalWeights+=(*e)->GetWeight();
1988 newSumClassWeights[(*e)->GetClass()] += (*e)->GetWeight();
1993 Double_t globalNormWeight=
Double_t(eventSample.size())/newSumGlobalWeights;
1994 Log() << kDEBUG <<
"new Nsig="<<newSumClassWeights[0]*globalNormWeight <<
" new Nbkg="<<newSumClassWeights[1]*globalNormWeight <<
Endl;
1997 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
2001 else (*e)->ScaleBoostWeight( globalNormWeight );
2039 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
2075 if ( !
DoRegression() )
Log() << kFATAL <<
"Somehow you chose a regression boost method for a classification job" <<
Endl;
2077 Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;
2079 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
2084 sumwfalse += w * tmpDev;
2085 sumwfalse2 += w * tmpDev*tmpDev;
2086 if (tmpDev > maxDev) maxDev = tmpDev;
2091 err = sumwfalse/maxDev/sumw ;
2094 err = sumwfalse2/maxDev/maxDev/sumw ;
2098 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
2101 err += w * (1 -
exp (-tmpDev/maxDev)) / sumw;
2106 Log() << kFATAL <<
" you've chosen a Loss type for Adaboost other than linear, quadratic or exponential " 2108 <<
"and this is not implemented... a typo in the options ??" <<
Endl;
2116 Log() << kERROR <<
" YOUR tree has only 1 Node... kind of a funny *tree*. I cannot " 2117 <<
"boost such a thing... if after 1 step the error rate is == 0.5" 2119 <<
"please check why this happens, maybe too many events per node requested ?" 2123 Log() << kERROR <<
" The error rate in the BDT boosting is > 0.5. ("<< err
2124 <<
") That should not happen, but is possible for regression trees, and" 2125 <<
" should trigger a stop for the boosting. please check your code (i.e... the BDT code), I " 2126 <<
" stop boosting " <<
Endl;
2130 }
else if (err < 0) {
2131 Log() << kERROR <<
" The error rate in the BDT boosting is < 0. That can happen" 2132 <<
" due to improper treatment of negative weights in a Monte Carlo.. (if you have" 2133 <<
" an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)" 2134 <<
" for the time being I set it to its absolute value.. just to continue.." <<
Endl;
2138 Double_t boostWeight = err / (1.-err);
2143 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
2145 results->
GetHist(
"BoostWeights")->
Fill(boostfactor);
2147 if ( (*e)->GetWeight() > 0 ){
2148 Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
2149 Float_t newWeight = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
2150 if (newWeight == 0) {
2151 Log() << kINFO <<
"Weight= " << (*e)->GetWeight() <<
Endl;
2152 Log() << kINFO <<
"BoostWeight= " << (*e)->GetBoostWeight() <<
Endl;
2153 Log() << kINFO <<
"boostweight="<<boostWeight <<
" err= " <<err <<
Endl;
2154 Log() << kINFO <<
"NewBoostWeight= " << newBoostWeight <<
Endl;
2155 Log() << kINFO <<
"boostfactor= " << boostfactor <<
Endl;
2156 Log() << kINFO <<
"maxDev = " << maxDev <<
Endl;
2158 Log() << kINFO <<
"target = " << (*e)->GetTarget(0) <<
Endl;
2161 (*e)->SetBoostWeight( newBoostWeight );
2164 (*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
2166 newSumw+=(*e)->GetWeight();
2170 Double_t normWeight = sumw / newSumw;
2171 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();
e++) {
2174 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
2212 void* trxml =
fForest[i]->AddXMLTo(wght);
2232 if (
gTools().HasAttr( parent,
Form(
"PreselectionLowBkgVar%d",0))) {
2267 if(
gTools().HasAttr(parent,
"TreeType")) {
2278 fForest.back()->SetTreeID(i++);
2292 Int_t analysisType(0);
2296 Log() << kINFO <<
"Read " << fNTrees <<
" Decision trees" <<
Endl;
2304 istr >> dummy >> iTree >> dummy >> boostWeight;
2306 fForest.back()->Print( std::cout );
2307 Log() << kFATAL <<
"Error while reading weight file; mismatch iTree=" 2308 << iTree <<
" i=" << i
2309 <<
" dummy " << dummy
2310 <<
" boostweight " << boostWeight
2357 if (useNTrees > 0 ) nTrees = useNTrees;
2363 for (
UInt_t itree=0; itree<nTrees; itree++) {
2382 std::vector<Double_t> temp(nClasses);
2383 auto forestSize =
fForest.size();
2387 for (
UInt_t itree = 0; itree < forestSize; ++itree) {
2389 if (++classOfTree == nClasses) classOfTree = 0;
2394 std::transform(temp.begin(), temp.end(), temp.begin(), [](
Double_t d){
return exp(d);});
2396 for(
UInt_t iClass=0; iClass<nClasses; iClass++){
2398 for(
UInt_t j=0;j<nClasses;j++){
2400 norm += temp[j] / temp[iClass];
2402 (*fMulticlassReturnVal).push_back(1.0/(1.0+norm));
2433 vector< Double_t > response(
fForest.size());
2434 vector< Double_t > weight(
fForest.size());
2443 std::vector< std::vector<Double_t> > vtemp;
2444 vtemp.push_back( response );
2445 vtemp.push_back( weight );
2450 while (sumOfWeights <= totalSumOfWeights/2.) {
2451 sumOfWeights += vtemp[1][t];
2521 for (
UInt_t i=0; i< relativeImportance.size(); i++) {
2543 if (ivar < (
UInt_t)relativeImportance.size())
return relativeImportance[ivar];
2544 else Log() << kFATAL <<
"<GetVariableImportance> ivar = " << ivar <<
" is out of range " <<
Endl;
2574 Log() <<
"Boosted Decision Trees are a collection of individual decision" <<
Endl;
2575 Log() <<
"trees which form a multivariate classifier by (weighted) majority " <<
Endl;
2576 Log() <<
"vote of the individual trees. Consecutive decision trees are " <<
Endl;
2577 Log() <<
"trained using the original training data set with re-weighted " <<
Endl;
2578 Log() <<
"events. By default, the AdaBoost method is employed, which gives " <<
Endl;
2579 Log() <<
"events that were misclassified in the previous tree a larger " <<
Endl;
2580 Log() <<
"weight in the training of the following tree." <<
Endl;
2582 Log() <<
"Decision trees are a sequence of binary splits of the data sample" <<
Endl;
2583 Log() <<
"using a single discriminant variable at a time. A test event " <<
Endl;
2584 Log() <<
"ending up after the sequence of left-right splits in a final " <<
Endl;
2585 Log() <<
"(\"leaf\") node is classified as either signal or background" <<
Endl;
2586 Log() <<
"depending on the majority type of training events in that node." <<
Endl;
2590 Log() <<
"By the nature of the binary splits performed on the individual" <<
Endl;
2591 Log() <<
"variables, decision trees do not deal well with linear correlations" <<
Endl;
2592 Log() <<
"between variables (they need to approximate the linear split in" <<
Endl;
2593 Log() <<
"the two dimensional space by a sequence of splits on the two " <<
Endl;
2594 Log() <<
"variables individually). Hence decorrelation could be useful " <<
Endl;
2595 Log() <<
"to optimise the BDT performance." <<
Endl;
2599 Log() <<
"The two most important parameters in the configuration are the " <<
Endl;
2600 Log() <<
"minimal number of events requested by a leaf node as percentage of the " <<
Endl;
2601 Log() <<
" number of training events (option \"MinNodeSize\" replacing the actual number " <<
Endl;
2602 Log() <<
" of events \"nEventsMin\" as given in earlier versions" <<
Endl;
2603 Log() <<
"If this number is too large, detailed features " <<
Endl;
2604 Log() <<
"in the parameter space are hard to be modelled. If it is too small, " <<
Endl;
2605 Log() <<
"the risk to overtrain rises and boosting seems to be less effective" <<
Endl;
2606 Log() <<
" typical values from our current experience for best performance " <<
Endl;
2607 Log() <<
" are between 0.5(%) and 10(%) " <<
Endl;
2609 Log() <<
"The default minimal number is currently set to " <<
Endl;
2610 Log() <<
" max(20, (N_training_events / N_variables^2 / 10)) " <<
Endl;
2611 Log() <<
"and can be changed by the user." <<
Endl;
2613 Log() <<
"The other crucial parameter, the pruning strength (\"PruneStrength\")," <<
Endl;
2614 Log() <<
"is also related to overtraining. It is a regularisation parameter " <<
Endl;
2615 Log() <<
"that is used when determining after the training which splits " <<
Endl;
2616 Log() <<
"are considered statistically insignificant and are removed. The" <<
Endl;
2617 Log() <<
"user is advised to carefully watch the BDT screen output for" <<
Endl;
2618 Log() <<
"the comparison between efficiencies obtained on the training and" <<
Endl;
2619 Log() <<
"the independent test sample. They should be equal within statistical" <<
Endl;
2620 Log() <<
"errors, in order to minimize statistical fluctuations in different samples." <<
Endl;
2632 fout <<
" std::vector<"<<nodeName<<
"*> fForest; // i.e. root nodes of decision trees" << std::endl;
2633 fout <<
" std::vector<double> fBoostWeights; // the weights applied in the individual boosts" << std::endl;
2634 fout <<
"};" << std::endl << std::endl;
2635 fout <<
"double " << className <<
"::GetMvaValue__( const std::vector<double>& inputValues ) const" << std::endl;
2636 fout <<
"{" << std::endl;
2637 fout <<
" double myMVA = 0;" << std::endl;
2641 fout <<
" if (inputValues["<<ivar<<
"] < " <<
fLowBkgCut[ivar] <<
") return -1; // is background preselection cut" << std::endl;
2644 fout <<
" if (inputValues["<<ivar<<
"] < "<<
fLowSigCut[ivar] <<
") return 1; // is signal preselection cut" << std::endl;
2647 fout <<
" if (inputValues["<<ivar<<
"] > "<<
fHighBkgCut[ivar] <<
") return -1; // is background preselection cut" << std::endl;
2650 fout <<
" if (inputValues["<<ivar<<
"] > "<<
fHighSigCut[ivar]<<
") return 1; // is signal preselection cut" << std::endl;
2656 fout <<
" double norm = 0;" << std::endl;
2658 fout <<
" for (unsigned int itree=0; itree<fForest.size(); itree++){" << std::endl;
2659 fout <<
" "<<nodeName<<
" *current = fForest[itree];" << std::endl;
2660 fout <<
" while (current->GetNodeType() == 0) { //intermediate node" << std::endl;
2661 fout <<
" if (current->GoesRight(inputValues)) current=("<<nodeName<<
"*)current->GetRight();" << std::endl;
2662 fout <<
" else current=("<<nodeName<<
"*)current->GetLeft();" << std::endl;
2663 fout <<
" }" << std::endl;
2665 fout <<
" myMVA += current->GetResponse();" << std::endl;
2667 if (
fUseYesNoLeaf) fout <<
" myMVA += fBoostWeights[itree] * current->GetNodeType();" << std::endl;
2668 else fout <<
" myMVA += fBoostWeights[itree] * current->GetPurity();" << std::endl;
2669 fout <<
" norm += fBoostWeights[itree];" << std::endl;
2671 fout <<
" }" << std::endl;
2673 fout <<
" return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << std::endl;
2675 else fout <<
" return myMVA /= norm;" << std::endl;
2676 fout <<
"};" << std::endl << std::endl;
2677 fout <<
"void " << className <<
"::Initialize()" << std::endl;
2678 fout <<
"{" << std::endl;
2681 fout <<
" // itree = " << itree << std::endl;
2682 fout <<
" fBoostWeights.push_back(" <<
fBoostWeights[itree] <<
");" << std::endl;
2683 fout <<
" fForest.push_back( " << std::endl;
2685 fout <<
" );" << std::endl;
2687 fout <<
" return;" << std::endl;
2688 fout <<
"};" << std::endl;
2689 fout <<
" " << std::endl;
2690 fout <<
"// Clean up" << std::endl;
2691 fout <<
"inline void " << className <<
"::Clear() " << std::endl;
2692 fout <<
"{" << std::endl;
2693 fout <<
" for (unsigned int itree=0; itree<fForest.size(); itree++) { " << std::endl;
2694 fout <<
" delete fForest[itree]; " << std::endl;
2695 fout <<
" }" << std::endl;
2696 fout <<
"}" << std::endl;
2708 fout <<
"#define NN new "<<nodeName << std::endl;
2710 fout <<
" " << std::endl;
2711 fout <<
"#ifndef "<<nodeName<<
"__def" << std::endl;
2712 fout <<
"#define "<<nodeName<<
"__def" << std::endl;
2713 fout <<
" " << std::endl;
2714 fout <<
"class "<<nodeName<<
" {" << std::endl;
2715 fout <<
" " << std::endl;
2716 fout <<
"public:" << std::endl;
2717 fout <<
" " << std::endl;
2718 fout <<
" // constructor of an essentially \"empty\" node floating in space" << std::endl;
2719 fout <<
" "<<nodeName<<
" ( "<<nodeName<<
"* left,"<<nodeName<<
"* right," << std::endl;
2721 fout <<
" int nFisherCoeff," << std::endl;
2723 fout <<
" double fisherCoeff"<<i<<
"," << std::endl;
2726 fout <<
" int selector, double cutValue, bool cutType, " << std::endl;
2727 fout <<
" int nodeType, double purity, double response ) :" << std::endl;
2728 fout <<
" fLeft ( left )," << std::endl;
2729 fout <<
" fRight ( right )," << std::endl;
2730 if (
fUseFisherCuts) fout <<
" fNFisherCoeff ( nFisherCoeff )," << std::endl;
2731 fout <<
" fSelector ( selector )," << std::endl;
2732 fout <<
" fCutValue ( cutValue )," << std::endl;
2733 fout <<
" fCutType ( cutType )," << std::endl;
2734 fout <<
" fNodeType ( nodeType )," << std::endl;
2735 fout <<
" fPurity ( purity )," << std::endl;
2736 fout <<
" fResponse ( response ){" << std::endl;
2739 fout <<
" fFisherCoeff.push_back(fisherCoeff"<<i<<
");" << std::endl;
2742 fout <<
" }" << std::endl << std::endl;
2743 fout <<
" virtual ~"<<nodeName<<
"();" << std::endl << std::endl;
2744 fout <<
" // test event if it descends the tree at this node to the right" << std::endl;
2745 fout <<
" virtual bool GoesRight( const std::vector<double>& inputValues ) const;" << std::endl;
2746 fout <<
" "<<nodeName<<
"* GetRight( void ) {return fRight; };" << std::endl << std::endl;
2747 fout <<
" // test event if it descends the tree at this node to the left " << std::endl;
2748 fout <<
" virtual bool GoesLeft ( const std::vector<double>& inputValues ) const;" << std::endl;
2749 fout <<
" "<<nodeName<<
"* GetLeft( void ) { return fLeft; }; " << std::endl << std::endl;
2750 fout <<
" // return S/(S+B) (purity) at this node (from training)" << std::endl << std::endl;
2751 fout <<
" double GetPurity( void ) const { return fPurity; } " << std::endl;
2752 fout <<
" // return the node type" << std::endl;
2753 fout <<
" int GetNodeType( void ) const { return fNodeType; }" << std::endl;
2754 fout <<
" double GetResponse(void) const {return fResponse;}" << std::endl << std::endl;
2755 fout <<
"private:" << std::endl << std::endl;
2756 fout <<
" "<<nodeName<<
"* fLeft; // pointer to the left daughter node" << std::endl;
2757 fout <<
" "<<nodeName<<
"* fRight; // pointer to the right daughter node" << std::endl;
2759 fout <<
" int fNFisherCoeff; // =0 if this node doesn't use fisher, else =nvar+1 " << std::endl;
2760 fout <<
" std::vector<double> fFisherCoeff; // the fisher coeff (offset at the last element)" << std::endl;
2762 fout <<
" int fSelector; // index of variable used in node selection (decision tree) " << std::endl;
2763 fout <<
" double fCutValue; // cut value applied on this node to discriminate bkg against sig" << std::endl;
2764 fout <<
" bool fCutType; // true: if event variable > cutValue ==> signal , false otherwise" << std::endl;
2765 fout <<
" int fNodeType; // Type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal " << std::endl;
2766 fout <<
" double fPurity; // Purity of node from training"<< std::endl;
2767 fout <<
" double fResponse; // Regression response value of node" << std::endl;
2768 fout <<
"}; " << std::endl;
2769 fout <<
" " << std::endl;
2770 fout <<
"//_______________________________________________________________________" << std::endl;
2771 fout <<
" "<<nodeName<<
"::~"<<nodeName<<
"()" << std::endl;
2772 fout <<
"{" << std::endl;
2773 fout <<
" if (fLeft != NULL) delete fLeft;" << std::endl;
2774 fout <<
" if (fRight != NULL) delete fRight;" << std::endl;
2775 fout <<
"}; " << std::endl;
2776 fout <<
" " << std::endl;
2777 fout <<
"//_______________________________________________________________________" << std::endl;
2778 fout <<
"bool "<<nodeName<<
"::GoesRight( const std::vector<double>& inputValues ) const" << std::endl;
2779 fout <<
"{" << std::endl;
2780 fout <<
" // test event if it descends the tree at this node to the right" << std::endl;
2781 fout <<
" bool result;" << std::endl;
2783 fout <<
" if (fNFisherCoeff == 0){" << std::endl;
2784 fout <<
" result = (inputValues[fSelector] > fCutValue );" << std::endl;
2785 fout <<
" }else{" << std::endl;
2786 fout <<
" double fisher = fFisherCoeff.at(fFisherCoeff.size()-1);" << std::endl;
2787 fout <<
" for (unsigned int ivar=0; ivar<fFisherCoeff.size()-1; ivar++)" << std::endl;
2788 fout <<
" fisher += fFisherCoeff.at(ivar)*inputValues.at(ivar);" << std::endl;
2789 fout <<
" result = fisher > fCutValue;" << std::endl;
2790 fout <<
" }" << std::endl;
2792 fout <<
" result = (inputValues[fSelector] > fCutValue );" << std::endl;
2794 fout <<
" if (fCutType == true) return result; //the cuts are selecting Signal ;" << std::endl;
2795 fout <<
" else return !result;" << std::endl;
2796 fout <<
"}" << std::endl;
2797 fout <<
" " << std::endl;
2798 fout <<
"//_______________________________________________________________________" << std::endl;
2799 fout <<
"bool "<<nodeName<<
"::GoesLeft( const std::vector<double>& inputValues ) const" << std::endl;
2800 fout <<
"{" << std::endl;
2801 fout <<
" // test event if it descends the tree at this node to the left" << std::endl;
2802 fout <<
" if (!this->GoesRight(inputValues)) return true;" << std::endl;
2803 fout <<
" else return false;" << std::endl;
2804 fout <<
"}" << std::endl;
2805 fout <<
" " << std::endl;
2806 fout <<
"#endif" << std::endl;
2807 fout <<
" " << std::endl;
2816 Log() << kFATAL <<
"MakeClassInstantiateNode: started with undefined node" <<
Endl;
2819 fout <<
"NN("<<std::endl;
2826 fout <<
", " <<std::endl;
2833 fout <<
", " << std::endl
2834 << std::setprecision(6);
2861 Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
2863 std::vector<TMVA::BDTEventWrapper> bdtEventSample;
2878 for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
2880 nTotS += (*it)->GetWeight();
2884 nTotB += (*it)->GetWeight();
2892 std::sort( bdtEventSample.begin(),bdtEventSample.end() );
2894 Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
2895 std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
2896 for( ; it != it_end; ++it ) {
2898 sigWeightCtr += (**it)->GetWeight();
2900 bkgWeightCtr += (**it)->GetWeight();
2902 it->SetCumulativeWeight(
false,bkgWeightCtr);
2903 it->SetCumulativeWeight(
true,sigWeightCtr);
2909 Double_t nSelS, nSelB, effS=0.05, effB=0.05, rejS=0.05, rejB=0.05;
2910 Double_t tmpEffS, tmpEffB, tmpRejS, tmpRejB;
2915 for(
UInt_t iev = 1; iev < bdtEventSample.size(); iev++) {
2918 nSelS = bdtEventSample[iev].GetCumulativeWeight(
true);
2919 nSelB = bdtEventSample[iev].GetCumulativeWeight(
false);
2921 tmpEffS=nSelS/nTotS;
2922 tmpEffB=nSelB/nTotB;
2926 else if (nSelB==0 && tmpEffS>effS) {effS=tmpEffS;
fLowSigCut[ivar] = bdtEventSample[iev].GetVal() - dVal;
fIsLowSigCut[ivar]=
kTRUE;}
2933 Log() << kDEBUG <<
" \tfound and suggest the following possible pre-selection cuts " <<
Endl;
2934 if (
fDoPreselection)
Log() << kDEBUG <<
"\tthe training will be done after these cuts... and GetMVA value returns +1, (-1) for a signal (bkg) event that passes these cuts" <<
Endl;
2935 else Log() << kDEBUG <<
"\tas option DoPreselection was not used, these cuts however will not be performed, but the training will see the full sample"<<
Endl;
2938 Log() << kDEBUG <<
" \tfound cut: Bkg if var " << ivar <<
" < " <<
fLowBkgCut[ivar] <<
Endl;
2941 Log() << kDEBUG <<
" \tfound cut: Sig if var " << ivar <<
" < " <<
fLowSigCut[ivar] <<
Endl;
2944 Log() << kDEBUG <<
" \tfound cut: Bkg if var " << ivar <<
" > " <<
fHighBkgCut[ivar] <<
Endl;
2947 Log() << kDEBUG <<
" \tfound cut: Sig if var " << ivar <<
" > " <<
fHighSigCut[ivar] <<
Endl;
Double_t AdaCost(std::vector< const TMVA::Event * > &, DecisionTree *dt)
The AdaCost boosting algorithm takes a simple cost Matrix (currently fixed for all events...
Types::EAnalysisType fAnalysisType
void Train(void)
BDT training.
virtual const char * GetTitle() const
Returns title of object.
void PreProcessNegativeEventWeights()
O.k.
Double_t AdaBoostR2(std::vector< const TMVA::Event * > &, DecisionTree *dt)
Adaptation of AdaBoost to regression problems (see H. Drucker 1997).
virtual Int_t Fill(Double_t x)
Increment bin with abscissa X by 1.
double dist(Rotation3D const &r1, Rotation3D const &r2)
static long int sum(long int i)
Long64_t GetNTestEvents() const
virtual Double_t Fit(std::vector< LossFunctionEventInfo > &evs)=0
Random number generator class based on M.
THist< 1, int, THistStatContent > TH1I
virtual Double_t PoissonD(Double_t mean)
Generates a random number according to a Poisson law.
MsgLogger & Endl(MsgLogger &ml)
Double_t Boost(std::vector< const TMVA::Event * > &, DecisionTree *dt, UInt_t cls=0)
Apply the boosting algorithm (the algorithm is selected via the "option" given in the constructor...
TH1 * GetHist(const TString &alias) const
Singleton class for Global types used by TMVA.
void WriteMonitoringHistosToFile(void) const
Here we could write some histograms created during the processing to the output file.
std::vector< Bool_t > fIsLowSigCut
void DeclareCompatibilityOptions()
Options that are used ONLY for the READER to ensure backward compatibility.
std::map< const TMVA::Event *, LossFunctionEventInfo > fLossFunctionEventInfo
Bool_t fPairNegWeightsGlobal
void AddPoint(Double_t x, Double_t y1, Double_t y2)
This function is used only in the two-TGraph case; it adds new data points to the graphs.
void SetUseNvars(Int_t n)
#define REGISTER_METHOD(CLASS)
for example
void AddWeightsXMLTo(void *parent) const
Write weights to XML.
Double_t GradBoost(std::vector< const TMVA::Event * > &, DecisionTree *dt, UInt_t cls=0)
Calculate the desired response value for each region.
const Ranking * CreateRanking()
Compute ranking of input variables.
virtual void Delete(Option_t *option="")
Delete this tree from memory or/and disk.
virtual void SetTargets(std::vector< const TMVA::Event * > &evs, std::map< const TMVA::Event *, LossFunctionEventInfo > &evinfomap)=0
void MakeClassSpecificHeader(std::ostream &, const TString &) const
Specific class header.
void BDT(TString dataset, const TString &fin="TMVA.root")
Absolute Deviation BDT Loss Function.
TString & ReplaceAll(const TString &s1, const TString &s2)
const char * GetName() const
Bool_t IgnoreEventsWithNegWeightsInTraining() const
virtual Int_t Fill()
Fill all branches.
virtual void SetName(const char *name)
Set the name of the TNamed.
UInt_t GetNClasses() const
THist< 1, float, THistStatContent, THistStatUncertainty > TH1F
std::vector< Bool_t > fIsHighSigCut
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
void DeclareOptions()
Define the options (their key words).
std::vector< Double_t > fVariableImportance
Double_t Atof() const
Return floating-point value contained in string.
UInt_t GetNFisherCoeff() const
const Event * GetTestingEvent(Long64_t ievt) const
Double_t fMinLinCorrForFisher
Virtual base Class for all MVA method.
std::vector< const TMVA::Event * > fEventSample
virtual DecisionTreeNode * GetRight() const
TMVA::DecisionTreeNode * GetEventNode(const TMVA::Event &e) const
get the pointer to the leaf node where a particular event ends up in...
Double_t Bagging()
Call it boot-strapping, re-sampling or whatever you like, in the end it is nothing else but applying ...
Bool_t IsNormalised() const
1-D histogram with a float per channel (see TH1 documentation).
TransformationHandler & GetTransformationHandler(Bool_t takeReroutedIfAvailable=true)
Ranking for variables in method (implementation)
Short_t Min(Short_t a, Short_t b)
void ToLower()
Change string to lower-case.
Int_t GetNodeType(void) const
virtual void SetYTitle(const char *title)
virtual void SetTitle(const char *title="")
Set graph title.
void DeterminePreselectionCuts(const std::vector< const TMVA::Event * > &eventSample)
Find useful preselection cuts that will be applied before any Decision Tree training.
void ProcessOptions()
The option string is decoded, for available options see "DeclareOptions".
void UpdateTargetsRegression(std::vector< const TMVA::Event * > &, Bool_t first=kFALSE)
Calculate current residuals for all events and update targets for next iteration. ...
Int_t FloorNint(Double_t x)
virtual DecisionTreeNode * GetLeft() const
std::vector< Bool_t > fIsHighBkgCut
void SetShrinkage(Double_t s)
virtual const char * GetPath() const
Returns the full path of the directory.
TString fRegressionLossFunctionBDTGS
Double_t GetMvaValue(Double_t *err=0, Double_t *errUpper=0)
LongDouble_t Power(LongDouble_t x, LongDouble_t y)
const TString & GetMethodName() const
Bool_t IsConstructedFromWeightFile() const
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
void GetHelpMessage() const
Get help message text.
std::vector< Double_t > fHighBkgCut
Double_t GetGradBoostMVA(const TMVA::Event *e, UInt_t nTrees)
Returns MVA value: -1 for background, 1 for signal.
Types::EAnalysisType GetAnalysisType() const
Double_t fBaggedSampleFraction
Implementation of the CrossEntropy as separation criterion.
Bool_t fInverseBoostNegWeights
virtual void SetTuneParameters(std::map< TString, Double_t > tuneParameters)
Set the tuning parameters according to the argument.
virtual Double_t Determinant() const
Bool_t IsSignal(const Event *ev) const
Float_t GetPurity(void) const
Bool_t GetCutType(void) const
Double_t fSigToBkgFraction
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
BDT can handle classification with multiple classes and regression with one regression-target.
void Reset(void)
Reset the method, as if it had just been instantiated (forget all training etc.). ...
Bool_t DoMulticlass() const
TString & Append(const char *cs)
Double_t RegBoost(std::vector< const TMVA::Event * > &, DecisionTree *dt)
A special boosting only for Regression (not implemented).
void SetMinNodeSize(Double_t sizeInPercent)
void Init(std::vector< TString > &graphTitles)
This function takes a list of titles and creates a TGraph for every title.
std::vector< Double_t > fHighSigCut
Class that contains all the data information.
Least Squares BDT Loss Function.
Implementation of the SdivSqrtSplusB as separation criterion.
PDF wrapper for histograms; uses user-defined spline interpolation.
void MakeClassInstantiateNode(DecisionTreeNode *n, std::ostream &fout, const TString &className) const
Recursively descends a tree and writes the node instance to the output stream.
const std::vector< Float_t > & GetMulticlassValues()
Get the multiclass MVA response for the BDT classifier.
Double_t GradBoostRegression(std::vector< const TMVA::Event * > &, DecisionTree *dt)
Implementation of M_TreeBoost using any loss function as described by Friedman 1999.
Implementation of the MisClassificationError as separation criterion.
void InitGradBoost(std::vector< const TMVA::Event * > &)
Initialize targets for first tree.
Double_t CheckEvent(const TMVA::Event *, Bool_t UseYesNoLeaf=kFALSE) const
the event e is put into the decision tree (starting at the root node) and the output is NodeType (sig...
Bool_t fNoNegWeightsInTraining
TString GetElapsedTime(Bool_t Scientific=kTRUE)
returns pretty string with elapsed time
UInt_t GetNVariables() const
const std::vector< Float_t > & GetRegressionValues()
Get the regression value generated by the BDTs.
void InitEventSample()
Initialize the event sample (i.e. reset the boost-weights... etc).
std::vector< Bool_t > fIsLowBkgCut
virtual void Delete(Option_t *option="")
Delete this object.
Bool_t HasTrainingTree() const
const Event * GetTrainingEvent(Long64_t ievt) const
VecExpr< UnaryOp< Fabs< T >, VecExpr< A, T, D >, T >, T, D > fabs(const VecExpr< A, T, D > &rhs)
virtual Int_t Write(const char *name=0, Int_t option=0, Int_t bufsize=0)
Write this object to the current directory.
Results * GetResults(const TString &, Types::ETreeType type, Types::EAnalysisType analysistype)
std::vector< Double_t > fLowBkgCut
Double_t fNodePurityLimit
Service class for 2-Dim histogram classes.
void SetBaggedSampleFraction(Double_t f)
std::map< TString, Double_t > optimize()
void BoostMonitor(Int_t iTree)
Fills the ROCIntegral vs Itree from the testSample for the monitoring plots during the training ...
The TMVA::Interval Class.
Double_t GetFisherCoeff(Int_t ivar) const
Bool_t fTrainWithNegWeights
Bool_t fSkipNormalization
ClassInfo * GetClassInfo(Int_t clNum) const
void DeleteResults(const TString &, Types::ETreeType type, Types::EAnalysisType analysistype)
delete the results stored for this particular Method instance.
virtual ~MethodBDT(void)
Destructor.
Implementation of the GiniIndex as separation criterion.
virtual void SetBinContent(Int_t bin, Double_t content)
Set bin content; see the convention for numbering bins in TH1::GetBin. In case the bin number is greater th...
void SetNodePurityLimit(Double_t l)
Double_t PrivateGetMvaValue(const TMVA::Event *ev, Double_t *err=0, Double_t *errUpper=0, UInt_t useNTrees=0)
Return the MVA value (range [-1;1]) that classifies the event according to the majority vote from the...
Implementation of a Decision Tree.
const Event * GetEvent() const
char * Form(const char *fmt,...)
void SetTarget(UInt_t itgt, Float_t value)
set the target value (dimension itgt) to value
SeparationBase * fSepType
void Init(void)
Common initialisation with defaults for the BDT-Method.
void ReadWeightsFromXML(void *parent)
Reads the BDT from the xml file.
virtual void Print(Option_t *option="") const
Print TNamed name and title.
Double_t AdaBoost(std::vector< const TMVA::Event * > &, DecisionTree *dt)
The AdaBoost implementation.
Double_t TestTreeQuality(DecisionTree *dt)
Test the tree quality in terms of misclassification.
Implementation of the GiniIndex With Laplace correction as separation criterion.
DecisionTree::EPruneMethod fPruneMethod
static void SetVarIndex(Int_t iVar)
void SetCurrentType(Types::ETreeType type) const
TGraph * GetGraph(const TString &alias) const
void ReadWeightsFromStream(std::istream &istr)
Read the weights (BDT coefficients).
Double_t ApplyPreselectionCuts(const Event *ev)
Apply the preselection cuts before even bothering about any Decision Trees in the GetMVA ...
void UpdateTargets(std::vector< const TMVA::Event * > &, UInt_t cls=0)
Calculate residual for all events.
std::vector< Float_t > * fMulticlassReturnVal
virtual Double_t GetROCIntegral(TH1D *histS, TH1D *histB) const
Calculate the area (integral) under the ROC curve as an overall quality measure of the classification ...
void SetMaxDepth(Int_t d)
TDirectory * BaseDir() const
returns the ROOT directory where info/histograms etc of the corresponding MVA method instance are sto...
static DecisionTree * CreateFromXML(void *node, UInt_t tmva_Version_Code=TMVA_VERSION_CODE)
re-create a new tree (decision tree or search tree) from XML
static RooMathCoreReg dummy
void SetAdaBoostBeta(Double_t b)
Bool_t IsFloat() const
Returns kTRUE if string contains a floating point or integer number.
std::vector< const TMVA::Event * > * fTrainSample
you should not use this method at all Int_t Int_t Double_t Double_t Double_t e
DataSetInfo & DataInfo() const
VariableInfo & GetVariableInfo(Int_t i)
void AddPreDefVal(const T &)
const TString & GetInputLabel(Int_t i) const
The TMVA::Interval Class.
LossFunctionBDT * fRegressionLossFunctionBDTG
TMatrixTSym< Element > & Invert(Double_t *det=0)
Invert the matrix and calculate its determinant. Notice that the LU decomposition is used instead of B...
Float_t GetTarget(UInt_t itgt) const
virtual std::map< TString, Double_t > OptimizeTuningParameters(TString fomType="ROCIntegral", TString fitType="FitGA")
Call the Optimizer with the set of parameters and ranges that are meant to be tuned.
virtual Int_t Branch(TCollection *list, Int_t bufsize=32000, Int_t splitlevel=99, const char *name="")
Create one branch for each element in the collection.
Short_t GetSelector() const
Bool_t DoRegression() const
TString fNegWeightTreatment
Abstract ClassifierFactory template that handles arbitrary types.
void GetBaggedSubSample(std::vector< const TMVA::Event * > &)
Fills fEventSample with fBaggedSampleFraction*NEvents random training events.
virtual void SetXTitle(const char *title)
virtual void SetPoint(Int_t i, Double_t x, Double_t y)
Set x and y values for point number i.
IPythonInteractive * fInteractive
virtual void AddRank(const Rank &rank)
Add a new rank take ownership of it.
UInt_t GetTrainingTMVAVersionCode() const
virtual void DeclareCompatibilityOptions()
Options that are used ONLY for the READER to ensure backward compatibility; they are hence without any...
Class that is the base-class for a vector of result.
Short_t Max(Short_t a, Short_t b)
const TString & GetOptions() const
A Graph is a graphics object made of two arrays X and Y with npoints each.
std::vector< const TMVA::Event * > fValidationSample
std::vector< DecisionTree * > fForest
void DrawProgressBar(Int_t, const TString &comment="")
draws progress bar in color or B&W caution:
std::vector< Double_t > GetVariableImportance()
Return the relative variable importance, normalized to all variables together having the importance 1...
Double_t fFValidationEvents
std::vector< Double_t > fLowSigCut
std::vector< Float_t > * fRegressionReturnVal
Long64_t GetNTrainingEvents() const
A TTree object has a header with a name and a title.
std::map< const TMVA::Event *, std::vector< double > > fResiduals
void Store(TObject *obj, const char *alias=0)
virtual void Init(std::map< const TMVA::Event *, LossFunctionEventInfo > &evinfomap, std::vector< double > &boostWeights)=0
Double_t Sqrt(Double_t x)
virtual void Set(Int_t n)
Set number of points in the graph. Existing coordinates are preserved. New coordinates above fNpoints a...
THist< 2, float, THistStatContent, THistStatUncertainty > TH2F
double norm(double *x, double *p)
Float_t GetResponse(void) const
std::vector< const TMVA::Event * > fSubSample
Timing information for training and evaluation of MVA methods.
void MakeClassSpecific(std::ostream &, const TString &) const
Make ROOT-independent C++ class for classifier response (classifier-specific implementation).
Analysis of Boosted Decision Trees.
Int_t CeilNint(Double_t x)
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
void SetSignalReferenceCut(Double_t cut)
std::vector< double > fBoostWeights
Float_t GetCutValue(void) const
MethodBDT(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
The standard constructor for the "boosted decision trees".