using std::make_pair;

const Int_t TMVA::MethodBDT::fgDebugLevel = 0;

// standard constructor (excerpt of the initializer list)
   , fSigToBkgFraction(0)
   , fTransitionPoint(0)
   , fBaggedGradBoost(kFALSE)
   , fMinNodeSizeS("5%")
   , fMinLinCorrForFisher(.8)
   , fUseExclusiveVars(0)
   , fUseYesNoLeaf(kFALSE)
   , fNodePurityLimit(0)
   , fFValidationEvents(0)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fUseNTrainEvents(0)
   , fBaggedSampleFraction(0)
   , fNoNegWeightsInTraining(kFALSE)
   , fInverseBoostNegWeights(kFALSE)
   , fPairNegWeightsGlobal(kFALSE)
   , fTrainWithNegWeights(kFALSE)
   , fDoBoostMonitor(kFALSE)
   , fDoPreselection(kFALSE)
   , fHistoricBool(kFALSE)
{
   fMonitorNtuple = NULL;
   // ...
}
// second constructor (excerpt of the initializer list)
   , fSigToBkgFraction(0)
   , fTransitionPoint(0)
   , fBaggedGradBoost(kFALSE)
   , fMinNodeSizeS("5%")
   , fMinLinCorrForFisher(.8)
   , fUseExclusiveVars(0)
   , fNodePurityLimit(0)
   , fFValidationEvents(0)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fUseNTrainEvents(0)
   , fBaggedSampleFraction(0)
   , fNoNegWeightsInTraining(kFALSE)
   , fInverseBoostNegWeights(kFALSE)
   , fPairNegWeightsGlobal(kFALSE)
   , fTrainWithNegWeights(kFALSE)
// MethodBDT::DeclareOptions() (excerpt)
DeclareOptionRef(fMinNodeSizeS=tmp, "MinNodeSize",
                 "Minimum percentage of training events required in a leaf node (default: Classification: 5%, Regression: 0.2%)");
DeclareOptionRef(fNCuts, "nCuts",
                 "Number of grid points in variable range used in finding optimal cut in node splitting");
// ...
DeclareOptionRef(fRandomisedTrees, "UseRandomisedTrees",
                 "Determine at each node splitting the cut variable only as the best out of a random subset of variables (like in RandomForests)");
DeclareOptionRef(fUsePoissonNvars, "UsePoissonNvars",
                 "Interpret \"UseNvars\" not as a fixed number but as the mean of a Poisson distribution in each split with the RandomisedTree option");
DeclareOptionRef(fBaggedSampleFraction=.6, "BaggedSampleFraction",
                 "Relative size of bagged event sample to original size of the data sample (used whenever bagging is used, i.e. UseBaggedBoost, Bagging)");
// ...
                 "Use Sig or Bkg categories, or the purity = S/(S+B) as classification of the leaf node -> Real-AdaBoost");
// ...
DeclareOptionRef(fNegWeightTreatment="InverseBoostNegWeights", "NegWeightTreatment",
                 "How to treat events with negative weights in the BDT training (in particular the boosting): IgnoreInTraining; boost with inverse boost weight; pair events with negative and positive weights in the training sample and *annihilate* them (experimental!)");
// ...
DeclareOptionRef(fPruneMethodS, "PruneMethod",
                 "Note: for BDTs use small trees (e.g. MaxDepth=3) and NoPruning. Pruning: method used for pruning (removal) of statistically insignificant branches");
// ...
DeclareOptionRef(fBaggedGradBoost=kFALSE, "UseBaggedGrad",
                 "deprecated: use *UseBaggedBoost* instead: use only a random subsample of all events for growing the trees in each iteration.");
DeclareOptionRef(fBaggedSampleFraction, "GradBaggingFraction",
                 "deprecated: use *BaggedSampleFraction* instead: defines the fraction of events to be used in each iteration, e.g. when UseBaggedGrad=kTRUE.");
DeclareOptionRef(fUseNTrainEvents, "UseNTrainEvents",
                 "deprecated: use *BaggedSampleFraction* instead: number of randomly picked training events used in randomised (and bagged) trees");
// ...
                 "Use weighted trees or simple average in classification from the forest");
// MethodBDT::ProcessOptions() (excerpt)
// ...
      << "Sorry, automatic pruning strength determination is not implemented yet for ExpectedErrorPruning" << Endl;
// ...
      << "of events in a leaf node. This is DEPRECATED, please use the option \n"
      << "*MinNodeSize* giving the relative number as percentage of training \n"
      << "events instead. \n"
// ...
Log() << kWARNING << "Note also that explicitly setting *nEventsMin* so far OVERWRITES the recommended option \n"
// ...
Log() << kWARNING << "the option *InverseBoostNegWeights* does not exist for BoostType=Grad --> change to *IgnoreNegWeightsInTraining*" << Endl;
// ...
Log() << kWARNING << "You have chosen to use more than half of your training sample "
      << "to optimize the automatic pruning algorithm. This is probably wasteful "
      << "and your overall results will be degraded. Are you sure you want this?"
// ...
if (this->Data()->HasNegativeEventWeights()){
   Log() << kINFO << " You are using a Monte Carlo that has also negative weights. "
         << "That should in principle be fine as long as on average you end up with "
         << "something positive. For this you have to make sure that the minimal number "
         << "of (un-weighted) events demanded for a tree node (currently you use: MinNodeSize="
         // ...
         << ", (or the deprecated equivalent nEventsMin) you can set this via the "
         << "BDT option string when booking the "
         << "classifier) is large enough to allow for reasonable averaging!!! "
         << " If this does not help... maybe you want to try the option IgnoreNegWeightsInTraining, "
         << "which ignores events with negative weight in the training. " << Endl
         << Endl << "Note: You'll get a WARNING message during the training if that should ever happen" << Endl;
}
// ...
Log() << kWARNING << "Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" << Endl;
// ...
Log() << kWARNING << "Regression Trees do not work with Separation types other than <RegressionVariance> --> I will use it instead" << Endl;
// ...
Log() << kWARNING << "Sorry, UseFisherCuts is not available for regression analysis, I will ignore it!" << Endl;
// ...
Log() << kWARNING << "Sorry, the option of nCuts<0 using a more elaborate node splitting algorithm " << Endl;
Log() << kWARNING << "is not implemented for regression analysis! " << Endl;
Log() << kWARNING << "--> I switch to the default nCuts = 20 and use standard node splitting" << Endl;
// ...
Log() << kINFO << " Randomised trees use no pruning" << Endl;
// ...
Log() << kWARNING << "Sorry, when using the option UseFisherCuts, the other option nCuts<0 (i.e. using" << Endl;
Log() << kWARNING << " a more elaborate node splitting algorithm) is not implemented; I will switch it off " << Endl;
Log() << kWARNING << "--> I switch to the default nCuts = 20 and use standard node splitting WITH possible Fisher criteria" << Endl;
// ...
Log() << kERROR << " Zero decision trees demanded... that does not work!! "
      << " I set it to 1... just so that the program does not crash."
// ...
Log() << kWARNING << " you specified the option NegWeightTreatment=PairNegWeightsGlobal: this option is still considered EXPERIMENTAL!! " << Endl;
// ...
      << "* \n this has been translated to MaxDepth=" << fMaxDepth << Endl;
// ...
Log() << kWARNING << "You have specified a deprecated option *UseBaggedGrad* --> please use *UseBaggedBoost* instead" << Endl;

// MethodBDT::SetMinNodeSize() (excerpt)
if (sizeInPercent > 0 && sizeInPercent < 50){
   // ...
}
else {
   Log() << kFATAL << "you have demanded a minimal node size of "
         << sizeInPercent << "% of the training events... \n"
         << " that somehow does not make sense " << Endl;
}
// ...
Log() << kFATAL << "I had problems reading the option MinNodeEvents, which "
      << "after removing a possible % sign now reads " << sizeInPercent << Endl;
// MethodBDT::Reset() (excerpt)
Log() << kDEBUG << " successfully(?) reset the method " << Endl;
// MethodBDT::InitEventSample() (excerpt)
std::vector<const TMVA::Event*> tmpEventSample;
for (Long64_t ievt=0; ievt<nevents; ievt++) {
   // ...
   tmpEventSample.push_back(event);
}
// ...
for (UInt_t i=0; i<tmpEventSample.size(); i++) delete tmpEventSample[i];
// ...
for (Long64_t ievt=0; ievt<nevents; ievt++) {
   // ...
   if (firstNegWeight) {
      Log() << kWARNING << " Note, you have events with negative event weight in the sample, but you've chosen to ignore them" << Endl;
      // ...
   }
   else if (event->GetWeight()==0){
      if (firstZeroWeight) {
         // ...
         Log() << "Events with weight == 0 are going to be simply ignored " << Endl;
      }
   }
   // ...
   if (event->GetWeight() < 0) {
      // ...
      Log() << kWARNING << "Events with negative event weights are found and "
            << " will be removed prior to the actual BDT training by global "
            << " pairing (and subsequent annihilation) with positive weight events"
      // ...
      Log() << kWARNING << "Events with negative event weights are USED during "
            << "the BDT training. This might cause problems with small node sizes "
            << "or with the boosting. Please remove negative events from training "
            << "using the option *IgnoreEventsWithNegWeightsInTraining* in case you "
            << "observe problems with the boosting"
      // ...
   }
}
// round the validation-event spacing to the nearest integer
Int_t imodulo = static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ? ceil(modulo) : floor(modulo) );
// ...
      << "% of training used for validation)" << Endl;
Log() << kINFO << "<InitEventSample> For classification trees, " << Endl;
Log() << kINFO << " the effective number of backgrounds is scaled to match " << Endl;
Log() << kINFO << " the signal. Otherwise the first boosting step would do 'just that'!" << Endl;
Int_t sumSig=0, sumBkg=0;
// ...
if (sumSigW && sumBkgW){
   // ...
   Log() << kINFO << "re-normalise events such that Sig and Bkg have respective sum of weights = "
   // ...
   Log() << kINFO << " sig->sig*" << normSig << "ev. bkg->bkg*" << normBkg << "ev." << Endl;
   Log() << kINFO << "#events: (reweighted) sig: " << sumSigW*normSig << " bkg: " << sumBkgW*normBkg << Endl;
   Log() << kINFO << "#events: (unweighted) sig: " << sumSig << " bkg: " << sumBkg << Endl;
   for (Long64_t ievt=0; ievt<nevents; ievt++) {
      // ...
   }
}
else {
   Log() << kINFO << "--> could not determine scaling factors as either there are " << Endl;
   Log() << kINFO << " no signal events (sumSigW=" << sumSigW << ") or no bkg ev. (sumBkgW=" << sumBkgW << ")" << Endl;
}
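// A minimal sketch of the renormalisation the log messages above describe:
// scale factors normSig/normBkg chosen so that signal and background carry
// the same total weight. The numbers are illustrative only, and the actual
// TMVA factors additionally involve fSigToBkgFraction.
double sumSigW = 120., sumBkgW = 480.;        // example sums of weights
double target  = 0.5 * (sumSigW + sumBkgW);   // common target sum: 300
double normSig = target / sumSigW;            // sig -> sig * 2.5
double normBkg = target / sumBkgW;            // bkg -> bkg * 0.625
// now sumSigW*normSig == sumBkgW*normBkg == target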
// MethodBDT::PreProcessNegativeEventWeights() (excerpt)
std::vector<const Event*> negEvents;
// ...
if (totalNegWeights == 0 ) {
   Log() << kINFO << "no negative event weights found... no preprocessing necessary" << Endl;
   // ...
}
Log() << kINFO << "found a total of " << totalNegWeights << " of negative event weights which I am going to try to pair with positive events to annihilate them" << Endl;
Log() << kINFO << "found a total of " << totalPosWeights << " of events with positive weights" << Endl;
Log() << kINFO << "--> total sum of weights = " << totalWeights << " = " << totalNegWeights+totalPosWeights << Endl;
// ...
for (Int_t i=0; i<2; i++){
   invCov = ((*cov)[i]);
   // ...
   std::cout << "<MethodBDT::PreProcessNeg...> matrix is almost singular with determinant="
   // ...
             << " did you use variables that are linear combinations or highly correlated?"
   // ...
   std::cout << "<MethodBDT::PreProcessNeg...> matrix is singular with determinant="
   // ...
             << " did you use variables that are linear combinations?"
   // ...
}
Log() << kINFO << "Found a total of " << totalNegWeights << " in negative weights out of " << fEventSample.size() << " training events " << Endl;
Timer timer(negEvents.size(), "Negative Event paired");
for (UInt_t nev = 0; nev < negEvents.size(); nev++){
   timer.DrawProgressBar( nev );
   Double_t weight = negEvents[nev]->GetWeight();
   UInt_t iClassID = negEvents[nev]->GetClass();
   invCov = ((*cov)[iClassID]);
   // ...
   // Mahalanobis-style distance between the negative event and candidate iev
   dist += (negEvents[nev]->GetValue(ivar)-fEventSample[iev]->GetValue(ivar))*
           (*invCov)[ivar][jvar]*
           (negEvents[nev]->GetValue(jvar)-fEventSample[iev]->GetValue(jvar));
   // ...
   if (dist < minDist) { iMin=iev; minDist=dist;}
   // ...
   negEvents[nev]->SetBoostWeight( 0 );
   // ...
   negEvents[nev]->SetBoostWeight( newWeight/negEvents[nev]->GetOriginalWeight() );
   // ...
   else Log() << kFATAL << "preprocessing didn't find an event to pair with the negative weight... probably a bug" << Endl;
   weight = negEvents[nev]->GetWeight();
   // ...
}
Log() << kINFO << "<Negative Event Pairing> took: " << timer.GetElapsedTime()
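// The pairing above uses a Mahalanobis-style distance; a self-contained
// restatement of the triple product accumulated in the loop:
// dist = sum_ij (x_i - y_i) * invCov[i][j] * (x_j - y_j)
#include <vector>

double MahalanobisDist2(const std::vector<double>& x,
                        const std::vector<double>& y,
                        const std::vector<std::vector<double> >& invCov)
{
   double dist = 0;
   for (size_t i = 0; i < x.size(); ++i)
      for (size_t j = 0; j < x.size(); ++j)
         dist += (x[i]-y[i]) * invCov[i][j] * (x[j]-y[j]);
   return dist;   // the smallest dist picks the pairing partner
}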
std::vector<const Event*> newEventSample;
// ...
if (totalNegWeights < 0) Log() << kFATAL << " compensation of negative event weights with positive ones did not work " << totalNegWeights << Endl;
// ...
Log() << kINFO << " after PreProcessing, the Event sample is left with " << fEventSample.size() << " events (unweighted), all with positive weights, adding up to " << totalWeights << Endl;
Log() << kINFO << " nSig=" << nSig << " sigWeight=" << sigWeight << " nBkg=" << nBkg << " bkgWeight=" << bkgWeight << Endl;
// MethodBDT::OptimizeTuningParameters() (excerpt)
std::map<TString,TMVA::Interval*> tuneParameters;
std::map<TString,Double_t> tunedParameters;
// ...
tuneParameters.insert(std::pair<TString,Interval*>("NTrees",       new Interval(10,1000,5)));
tuneParameters.insert(std::pair<TString,Interval*>("MaxDepth",     new Interval(2,4,3)));
tuneParameters.insert(std::pair<TString,Interval*>("MinNodeSize",  new LogInterval(1,30,30)));
// ...
tuneParameters.insert(std::pair<TString,Interval*>("AdaBoostBeta", new Interval(.2,1.,5)));
// ...
tuneParameters.insert(std::pair<TString,Interval*>("Shrinkage",    new Interval(0.05,0.50,5)));
// ...
tuneParameters.insert(std::pair<TString,Interval*>("UseNvars",     new Interval(min_var,max_var,4)));
// ...
Log() << kINFO << " the following BDT parameters will be tuned on the respective *grid*\n" << Endl;
std::map<TString,TMVA::Interval*>::iterator it;
for (it=tuneParameters.begin(); it!=tuneParameters.end(); it++){
   // ...
}
// ...
tunedParameters = optimize.optimize();
// ...
return tunedParameters;

// MethodBDT::SetTuneParameters() (excerpt)
std::map<TString,Double_t>::iterator it;
for (it=tuneParameters.begin(); it!=tuneParameters.end(); it++){
   // ...
   else if (it->first == "MinNodeSize") SetMinNodeSize (it->second);
   // ...
   else if (it->first == "Shrinkage"  ) SetShrinkage   (it->second);
   // ...
   else Log() << kFATAL << " SetParameter for " << it->first << " not yet implemented " << Endl;
}
// MethodBDT::Train() (excerpt)
Log() << kERROR << " Zero decision trees demanded... that does not work!! "
      << " I set it to 1... just so that the program does not crash."
// ...
      << "please remove the option from the configuration string, or "
      << "use \"!Normalise\""
// ...
TString hname = "AdaBoost weight distribution";
// ...
hname = "Boost event weights distribution";
// ...
TH1* h = new TH1F("BoostWeight",hname,nBins,xMin,xMax);
TH1* nodesBeforePruningVsTree = new TH1I("NodesBeforePruning","nodes before pruning",fNTrees,0,fNTrees);
TH1* nodesAfterPruningVsTree  = new TH1I("NodesAfterPruning","nodes after pruning",fNTrees,0,fNTrees);
// ...
results->Store(h, "BoostWeights");
// ...
TH2* boostMonitor = new TH2F("BoostMonitor","ROC Integral Vs iTree",2,0,fNTrees,2,0,1.05);
// ...
boostMonitor->SetYTitle("ROC Integral");
results->Store(boostMonitor, "BoostMonitor");
// ...
boostMonitorGraph->SetName("BoostMonitorGraph");
boostMonitorGraph->SetTitle("ROCIntegralVsNTrees");
results->Store(boostMonitorGraph, "BoostMonitorGraph");
// ...
results->Store(h, "BoostWeightsVsTree");
// ...
results->Store(h, "ErrorFrac");
// ...
nodesBeforePruningVsTree->SetXTitle("#tree");
nodesBeforePruningVsTree->SetYTitle("#tree nodes");
results->Store(nodesBeforePruningVsTree);
// ...
nodesAfterPruningVsTree->SetXTitle("#tree");
nodesAfterPruningVsTree->SetYTitle("#tree nodes");
results->Store(nodesAfterPruningVsTree);
Int_t nNodesBeforePruningCount = 0;
Int_t nNodesAfterPruningCount = 0;
// ...
Int_t nNodesBeforePruning = 0;
Int_t nNodesAfterPruning = 0;
// ...
while (itree < fNTrees && continueBoost){
   // ...
   Log() << kFATAL << "Multiclass is currently only supported by gradient boost. "
         << "Please change boost option accordingly (GradBoost)."
   // ...
   for (UInt_t i=0;i<nClasses;i++){
      // ...
      fForest.back()->SetUseFisherCuts();
      // ...
   }
   // ...
   fForest.back()->SetUseFisherCuts();
   // ...
   nNodesBeforePruning = fForest.back()->CleanTree();
   // ...
   nNodesBeforePruningCount += nNodesBeforePruning;
   nodesBeforePruningVsTree->SetBinContent(itree+1,nNodesBeforePruning);
   // ...
   std::vector<const Event*> * validationSample = NULL;
   // ...
   nNodesAfterPruning = fForest.back()->GetNNodes();
   nNodesAfterPruningCount += nNodesAfterPruning;
   nodesAfterPruningVsTree->SetBinContent(itree+1,nNodesAfterPruning);
   // ...
   // fill the boost monitor on a thinning schedule of tree indices
   if ( itree==fNTrees-1 || (!(itree%500)) ||
        (!(itree%250) && itree < 1000) ||
        (!(itree%100) && itree <  500) ||
        (!(itree%50)  && itree <  250) ||
        (!(itree%25)  && itree <  150) ||
        (!(itree%10)  && itree <   50) ||
        (!(itree%5)   && itree <   20) )
   // ...
}
Log() << kINFO << "<Train> average number of nodes (w/o pruning) : "
// ...
Log() << kINFO << "<Train> average number of nodes before/after pruning : "
      << nNodesBeforePruningCount/GetNTrees() << " / "
// ...
Log() << kDEBUG << "Now I delete the private data sample" << Endl;
// MethodBDT::GetGradBoostMVA() (excerpt)
for (UInt_t itree=0; itree<nTrees; itree++) {
   // ...
}
// map the summed tree responses onto the MVA range [-1,1]
return 2.0/(1.0+exp(-2.0*sum))-1;
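// The return expression above is the scaled logistic 2/(1+exp(-2*sum))-1,
// which is identical to tanh(sum). A standalone restatement:
#include <cmath>

double GradBoostOutput(double sum)
{
   return 2.0/(1.0 + std::exp(-2.0*sum)) - 1.0;
}
// GradBoostOutput(0) == 0; large positive sums -> +1 (signal-like),
// large negative sums -> -1 (background-like)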
// MethodBDT::UpdateTargets() (excerpt)
for (std::vector<const TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   if (cls == nClasses-1){
      for (UInt_t i=0;i<nClasses;i++){
         // ...
         for (UInt_t j=0;j<nClasses;j++){
            // ...
         }
         // ...
         Double_t res = ((*e)->GetClass()==i) ? (1.0-p_cls) : (-p_cls);
         // ...
      }
   }
}
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
}

// MethodBDT::UpdateTargetsRegression() (excerpt)
vector< std::pair<Double_t, Double_t> > temp;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++){
   // ...
}
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
}

// MethodBDT::GetWeightedQuantile() (excerpt)
std::sort(vec.begin(), vec.end());
// ...
while (i<vec.size() && temp <= norm*quantile){
   temp += vec[i].second;
   // ...
}
if (i >= vec.size()) return 0.;
return vec[i].first;
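// Self-contained restatement of the weighted-quantile logic above: on a
// value-sorted list of (value, weight) pairs, walk up the cumulative weight
// until the requested fraction of the total weight is passed, then return
// the value at the stopping index.
#include <vector>
#include <utility>
#include <algorithm>

double WeightedQuantile(std::vector<std::pair<double,double> > vec,
                        double quantile, double sumOfWeights)
{
   std::sort(vec.begin(), vec.end());   // sort by value (pair::first)
   double temp = 0.;
   std::size_t i = 0;
   while (i < vec.size() && temp <= sumOfWeights*quantile) {
      temp += vec[i].second;            // accumulate weights
      ++i;
   }
   if (i >= vec.size()) return 0.;
   return vec[i].first;
}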
// MethodBDT::GradBoost() (excerpt): per-leaf update of the tree response
std::map<TMVA::DecisionTreeNode*,std::vector<Double_t> > leaves;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   Double_t weight = (*e)->GetWeight();
   // ...
   if ((leaves[node]).empty()){
      (leaves[node]).push_back((*e)->GetTarget(cls)* weight);
      (leaves[node]).push_back(fabs((*e)->GetTarget(cls))*(1.0-fabs((*e)->GetTarget(cls))) * weight* weight);
   }
   else {
      (leaves[node])[0] += ((*e)->GetTarget(cls)* weight);
      (leaves[node])[1] += fabs((*e)->GetTarget(cls))*(1.0-fabs((*e)->GetTarget(cls))) * weight* weight;
   }
}
for ( /* ... */ ; iLeave!=leaves.end(); ++iLeave){
   if ((iLeave->second)[1] < 1e-30) (iLeave->second)[1] = 1e-30;
   // ...
}

// MethodBDT::GradBoostRegression() (excerpt): Huber-loss leaf update
std::map<TMVA::DecisionTreeNode*,Double_t > leaveWeights;
std::map<TMVA::DecisionTreeNode*,vector< std::pair<Double_t, Double_t> > > leaves;
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   (leaveWeights[node]) += (*e)->GetWeight();
   // ...
}
// ...
for (std::map<TMVA::DecisionTreeNode*,vector< std::pair<Double_t, Double_t> > >::iterator iLeave=leaves.begin();
     iLeave!=leaves.end();++iLeave){
   // ...
   for (UInt_t j=0;j<((iLeave->second).size());j++){
      diff = (iLeave->second)[j].first-ResidualMedian;
      // ...
   }
   (iLeave->first)->SetResponse(fShrinkage*(ResidualMedian+shift));
}

// MethodBDT::InitGradBoost() (excerpt)
std::vector<std::pair<Double_t, Double_t> > temp;
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
}
// ...
std::map<const TMVA::Event*, std::pair<Double_t, Double_t> >::iterator res = fWeightedResiduals.begin();
// ...
(*res).second.first -= weightedMedian;
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   for (UInt_t i=0;i<nClasses;i++){
      // initial multiclass target: 1-1/nClasses for the true class, -1/nClasses otherwise
      Double_t r = (*e)->GetClass()==i ? (1-1.0/nClasses) : (-1.0/nClasses);
      // ...
   }
}
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
}

// MethodBDT::TestTreeQuality() (excerpt): fraction of correctly classified test events
return ncorrect / (ncorrect + nfalse);
// MethodBDT::Boost() (excerpt)
returnVal = this->GradBoost (eventSample, dt, cls);
// ...
returnVal = this->GradBoost (eventSample, dt);

// MethodBDT::BoostMonitor() (excerpt)
TH1F *tmpS = new TH1F( "tmpS", "", 100 , -1., 1.00001 );
TH1F *tmpB = new TH1F( "tmpB", "", 100 , -1., 1.00001 );
// ...
for (UInt_t iev=0; iev < nevents; iev++){
   // ...
   if (event->GetClass() == signalClassNr) {tmp=tmpS;}
   // ...
}
// ...
std::vector<TH1F*> hS;
std::vector<TH1F*> hB;
// ...
hS.push_back(new TH1F(Form("SigVar%dAtTree%d",ivar,iTree),Form("SigVar%dAtTree%d",ivar,iTree),100,
                      DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
hB.push_back(new TH1F(Form("BkgVar%dAtTree%d",ivar,iTree),Form("BkgVar%dAtTree%d",ivar,iTree),100,
                      DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
results->Store(hS.back(),hS.back()->GetTitle());
results->Store(hB.back(),hB.back()->GetTitle());
// ...
TH1F *tmpBoostWeightsS = new TH1F(Form("BoostWeightsInTreeS%d",iTree),Form("BoostWeightsInTreeS%d",iTree),100,0.,max);
TH1F *tmpBoostWeightsB = new TH1F(Form("BoostWeightsInTreeB%d",iTree),Form("BoostWeightsInTreeB%d",iTree),100,0.,max);
results->Store(tmpBoostWeightsS,tmpBoostWeightsS->GetTitle());
results->Store(tmpBoostWeightsB,tmpBoostWeightsB->GetTitle());
// ...
TH1F *tmpBoostWeights;
std::vector<TH1F*> *h;
// ...
tmpBoostWeights=tmpBoostWeightsS;
// ...
tmpBoostWeights=tmpBoostWeightsB;
// MethodBDT::AdaBoost() (excerpt)
Double_t err=0, sumGlobalw=0, sumGlobalwfalse=0, sumGlobalwfalse2=0;
// ...
std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);
std::map<Node*,Int_t> sigEventsInNode;
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   UInt_t iclass=(*e)->GetClass();
   // ...
   sumGlobalwfalse  += w * tmpDev;
   sumGlobalwfalse2 += w * tmpDev*tmpDev;
   if (tmpDev > maxDev) maxDev = tmpDev;
   // ...
   if (!(isSignalType == DataInfo().IsSignal(*e))) {
      sumGlobalwfalse += w;
   }
   // ...
   sumGlobalwfalse += w*trueType*dtoutput;
}
// ...
err = sumGlobalwfalse/sumGlobalw;
// ...
err = sumGlobalwfalse/maxDev/sumGlobalw;
// ...
err = sumGlobalwfalse2/maxDev/maxDev/sumGlobalw;
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   err += w * (1 - exp (-tmpDev/maxDev)) / sumGlobalw;
}
// ...
Log() << kFATAL << " you've chosen a Loss type for AdaBoost other than linear, quadratic or exponential "
      << "and this is not implemented... a typo in the options??" << Endl;
// ...
Log() << kDEBUG << "BDT AdaBoost wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw << Endl;
// ...
std::vector<Double_t> newSumw(sumw.size(),0);
// ...
Log() << kERROR << " YOUR tree has only 1 node... kind of a funny *tree*. I cannot "
      << "boost such a thing... if after 1 step the error rate is == 0.5"
// ...
      << "please check why this happens, maybe too many events per node requested?"
// ...
Log() << kERROR << " The error rate in the BDT boosting is > 0.5. (" << err
      << ") That should not happen, please check your code (i.e... the BDT code), I "
      << " stop boosting here" << Endl;
// ...
else if (err < 0) {
   Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
         << " due to improper treatment of negative weights in a Monte Carlo... (if you have"
         << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch);"
         << " for the time being I set it to its absolute value... just to continue..." << Endl;
}
// ...
Log() << kDEBUG << "BDT AdaBoost wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw
      << " 1-err/err=" << boostWeight << " log..." << TMath::Log(boostWeight) << Endl;
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   if ( (*e)->GetWeight() > 0 ){
      (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
      // ...
   }
   else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
   // ...
   if ( (*e)->GetWeight() > 0 ){
      (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
      // ...
   }
   else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
   // ...
   newSumGlobalw += (*e)->GetWeight();
   newSumw[(*e)->GetClass()] += (*e)->GetWeight();
}
// ...
Log() << kDEBUG << "new Nsig=" << newSumw[0]*globalNormWeight << " new Nbkg=" << newSumw[1]*globalNormWeight << Endl;
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   else (*e)->ScaleBoostWeight( globalNormWeight );
}
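// A compact sketch of the AdaBoost reweighting used above, assuming the
// standard scheme: the boost weight is (1-err)/err and misclassified
// events are multiplied by its AdaBoostBeta-th power before the global
// renormalisation.
#include <cmath>

double AdaBoostFactor(double err, double beta /* AdaBoostBeta */)
{
   double boostWeight = (1.0 - err) / err;   // > 1 as long as err < 0.5
   return std::pow(boostWeight, beta);       // factor for misclassified events
}
// e.g. err = 0.25, beta = 1: misclassified weights grow by a factor of 3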
// MethodBDT::AdaCost() (excerpt)
Double_t err=0, sumGlobalWeights=0, sumGlobalCost=0;
// ...
std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);
std::map<Node*,Int_t> sigEventsInNode;
// ...
for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   sumGlobalWeights += w;
   UInt_t iclass=(*e)->GetClass();
   // ...
   Log() << kFATAL << " AdaCost not implemented for regression" << Endl;
   // ...
   Bool_t isSelectedSignal = (dtoutput>0);
   if (isTrueSignal) trueType = 1;
   // ...
   // pick the entry of the 2x2 cost matrix for this (truth, selection) pair
   if      (isTrueSignal  &&  isSelectedSignal) cost=Css;
   else if (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
   else if (!isTrueSignal &&  isSelectedSignal) cost=Ctb_ss;
   else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
   else Log() << kERROR << "something went wrong in AdaCost" << Endl;
   // ...
   sumGlobalCost += w*trueType*dtoutput*cost;
}
// ...
Log() << kFATAL << " AdaCost not implemented for regression" << Endl;
// ...
sumGlobalCost /= sumGlobalWeights;
// ...
vector<Double_t> newSumClassWeights(sumw.size(),0);
// ...
for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   Bool_t isSelectedSignal = (dtoutput>0);
   if (isTrueSignal) trueType = 1;
   // ...
   if      (isTrueSignal  &&  isSelectedSignal) cost=Css;
   else if (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
   else if (!isTrueSignal &&  isSelectedSignal) cost=Ctb_ss;
   else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
   else Log() << kERROR << "something went wrong in AdaCost" << Endl;
   // ...
   if ( (*e)->GetWeight() > 0 ){
      (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
      // ...
   }
   // ...
   newSumGlobalWeights += (*e)->GetWeight();
   newSumClassWeights[(*e)->GetClass()] += (*e)->GetWeight();
}
// ...
Double_t globalNormWeight = Double_t(eventSample.size())/newSumGlobalWeights;
Log() << kDEBUG << "new Nsig=" << newSumClassWeights[0]*globalNormWeight << " new Nbkg=" << newSumClassWeights[1]*globalNormWeight << Endl;
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   else (*e)->ScaleBoostWeight( globalNormWeight );
}
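// The four AdaCost branches above form a 2x2 cost matrix over
// (truth, selection); a table-driven restatement using the same constants:
double AdaCostEntry(bool isTrueSignal, bool isSelectedSignal,
                    double Css, double Cts_sb, double Ctb_ss, double Cbb)
{
   if (isTrueSignal) return isSelectedSignal ? Css    : Cts_sb;
   else              return isSelectedSignal ? Ctb_ss : Cbb;
}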
// MethodBDT::GetBaggedSubSample() (excerpt)
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
}

// MethodBDT::AdaBoostR2() (excerpt)
if ( !DoRegression() ) Log() << kFATAL << "Somehow you chose a regression boost method for a classification job" << Endl;
// ...
Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   sumwfalse  += w * tmpDev;
   sumwfalse2 += w * tmpDev*tmpDev;
   if (tmpDev > maxDev) maxDev = tmpDev;
}
// ...
err = sumwfalse/maxDev/sumw;
// ...
err = sumwfalse2/maxDev/maxDev/sumw;
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   err += w * (1 - exp (-tmpDev/maxDev)) / sumw;
}
// ...
Log() << kFATAL << " you've chosen a Loss type for AdaBoost other than linear, quadratic or exponential "
      << "and this is not implemented... a typo in the options??" << Endl;
// ...
Log() << kERROR << " YOUR tree has only 1 node... kind of a funny *tree*. I cannot "
      << "boost such a thing... if after 1 step the error rate is == 0.5"
// ...
      << "please check why this happens, maybe too many events per node requested?"
// ...
Log() << kERROR << " The error rate in the BDT boosting is > 0.5. (" << err
      << ") That should not happen, but is possible for regression trees, and"
      << " should trigger a stop for the boosting. Please check your code (i.e... the BDT code), I "
      << " stop boosting " << Endl;
// ...
else if (err < 0) {
   Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
         << " due to improper treatment of negative weights in a Monte Carlo... (if you have"
         << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch);"
         << " for the time being I set it to its absolute value... just to continue..." << Endl;
}
// ...
Double_t boostWeight = err / (1.-err);
// ...
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   results->GetHist("BoostWeights")->Fill(boostfactor);
   // ...
   if ( (*e)->GetWeight() > 0 ){
      Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
      Float_t newWeight = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
      if (newWeight == 0) {
         Log() << kINFO << "Weight=         " << (*e)->GetWeight() << Endl;
         Log() << kINFO << "BoostWeight=    " << (*e)->GetBoostWeight() << Endl;
         Log() << kINFO << "boostweight=" << boostWeight << " err= " << err << Endl;
         Log() << kINFO << "NewBoostWeight= " << newBoostWeight << Endl;
         Log() << kINFO << "boostfactor=    " << boostfactor << Endl;
         // ...
         Log() << kINFO << "target =        " << (*e)->GetTarget(0) << Endl;
         // ...
      }
      (*e)->SetBoostWeight( newBoostWeight );
   }
   else (*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
   // ...
   newSumw += (*e)->GetWeight();
}
// renormalise the weights back to the original sum
Double_t normWeight = sumw / newSumw;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
   // ...
   (*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
}
// MethodBDT::AddWeightsXMLTo() (excerpt)
void* trxml = fForest[i]->AddXMLTo(wght);

// MethodBDT::ReadWeightsFromXML() (excerpt)
if (gTools().HasAttr( parent, Form("PreselectionLowBkgVar%d",0))) {
   // ...
}
// ...
if (gTools().HasAttr(parent, "TreeType")) {
   // ...
}
// ...
fForest.back()->SetTreeID(i++);

// MethodBDT::ReadWeightsFromStream() (excerpt)
Int_t analysisType(0);
// ...
Log() << kINFO << "Read " << fNTrees << " Decision trees" << Endl;
// ...
istr >> dummy >> iTree >> dummy >> boostWeight;
// ...
fForest.back()->Print( std::cout );
Log() << kFATAL << "Error while reading weight file; mismatch iTree="
      << iTree << " i=" << i
      << " dummy " << dummy
      << " boostweight " << boostWeight
// MethodBDT::PrivateGetMvaValue() (excerpt)
if (useNTrees > 0 ) nTrees = useNTrees;
// ...
for (UInt_t itree=0; itree<nTrees; itree++) {
   // ...
}

// MethodBDT::GetMulticlassValues() (excerpt)
std::vector<double> temp;
// ...
for (UInt_t iClass=0; iClass<nClasses; iClass++){
   temp.push_back(0.0);
   for (UInt_t itree = iClass; itree<fForest.size(); itree+=nClasses){
      // ...
   }
}
// ...
for (UInt_t iClass=0; iClass<nClasses; iClass++){
   // ...
   for (UInt_t j=0;j<nClasses;j++){
      // ...
      norm += exp(temp[j]-temp[iClass]);
   }
   (*fMulticlassReturnVal).push_back(1.0/(1.0+norm));
}
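// The loop above is a numerically safe softmax over the per-class score
// sums F_k: exp(F_i)/sum_j exp(F_j) == 1/(1 + sum_{j!=i} exp(F_j - F_i)).
// A self-contained restatement:
#include <vector>
#include <cmath>

std::vector<double> MulticlassResponse(const std::vector<double>& F)
{
   std::vector<double> out;
   for (std::size_t i = 0; i < F.size(); ++i) {
      double norm = 0.;
      for (std::size_t j = 0; j < F.size(); ++j)
         if (j != i) norm += std::exp(F[j] - F[i]);
      out.push_back(1.0 / (1.0 + norm));
   }
   return out;   // entries sum to 1
}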
// MethodBDT::GetRegressionValues() (excerpt): weighted median over tree responses
vector< Double_t > response(fForest.size());
vector< Double_t > weight(fForest.size());
// ...
std::vector< std::vector<Double_t> > vtemp;
vtemp.push_back( response );
vtemp.push_back( weight );
// ...
while (sumOfWeights <= totalSumOfWeights/2.) {
   sumOfWeights += vtemp[1][t];
   // ...
}
// MethodBDT::GetVariableImportance() (excerpt)
for (UInt_t i=0; i< relativeImportance.size(); i++) {
   // ...
}
// ...
if (ivar < (UInt_t)relativeImportance.size()) return relativeImportance[ivar];
else Log() << kFATAL << "<GetVariableImportance> ivar = " << ivar << " is out of range " << Endl;
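// GetVariableImportance() reports importances normalised so that all
// variables together sum to one; a minimal sketch of that normalisation:
#include <vector>

std::vector<double> NormalizeImportance(std::vector<double> imp)
{
   double sum = 0.;
   for (std::size_t i = 0; i < imp.size(); ++i) sum += imp[i];
   if (sum > 0) for (std::size_t i = 0; i < imp.size(); ++i) imp[i] /= sum;
   return imp;   // e.g. {2,1,1} -> {0.5, 0.25, 0.25}
}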
// MethodBDT::GetHelpMessage() (excerpt)
Log() << "Boosted Decision Trees are a collection of individual decision" << Endl;
Log() << "trees which form a multivariate classifier by (weighted) majority " << Endl;
Log() << "vote of the individual trees. Consecutive decision trees are " << Endl;
Log() << "trained using the original training data set with re-weighted " << Endl;
Log() << "events. By default, the AdaBoost method is employed, which gives " << Endl;
Log() << "events that were misclassified in the previous tree a larger " << Endl;
Log() << "weight in the training of the following tree." << Endl;
// ...
Log() << "Decision trees are a sequence of binary splits of the data sample" << Endl;
Log() << "using a single discriminant variable at a time. A test event " << Endl;
Log() << "ending up after the sequence of left-right splits in a final " << Endl;
Log() << "(\"leaf\") node is classified as either signal or background" << Endl;
Log() << "depending on the majority type of training events in that node." << Endl;
// ...
Log() << "By the nature of the binary splits performed on the individual" << Endl;
Log() << "variables, decision trees do not deal well with linear correlations" << Endl;
Log() << "between variables (they need to approximate the linear split in" << Endl;
Log() << "the two dimensional space by a sequence of splits on the two " << Endl;
Log() << "variables individually). Hence decorrelation could be useful " << Endl;
Log() << "to optimise the BDT performance." << Endl;
// ...
Log() << "The two most important parameters in the configuration are the " << Endl;
Log() << "minimal number of events required in a leaf node, as a percentage of the " << Endl;
Log() << "number of training events (option \"MinNodeSize\", replacing the absolute number " << Endl;
Log() << "of events \"nEventsMin\" as given in earlier versions)." << Endl;
Log() << "If this number is too large, detailed features " << Endl;
Log() << "in the parameter space are hard to model. If it is too small, " << Endl;
Log() << "the risk of overtraining rises and boosting seems to be less effective." << Endl;
Log() << "Typical values from our current experience for best performance " << Endl;
Log() << "are between 0.5(%) and 10(%). " << Endl;
// ...
Log() << "The default minimal number is currently set to " << Endl;
Log() << "   max(20, (N_training_events / N_variables^2 / 10)) " << Endl;
Log() << "and can be changed by the user." << Endl;
// ...
Log() << "The other crucial parameter, the pruning strength (\"PruneStrength\")," << Endl;
Log() << "is also related to overtraining. It is a regularisation parameter " << Endl;
Log() << "that is used when determining after the training which splits " << Endl;
Log() << "are considered statistically insignificant and are removed. The" << Endl;
Log() << "user is advised to carefully watch the BDT screen output for" << Endl;
Log() << "the comparison between efficiencies obtained on the training and" << Endl;
Log() << "the independent test sample. They should be equal within statistical" << Endl;
Log() << "errors, in order to minimize statistical fluctuations in different samples." << Endl;
// MethodBDT::MakeClassSpecific() (excerpt): emit a standalone C++ reader class
fout << "   std::vector<"<<nodeName<<"*> fForest;       // i.e. root nodes of decision trees" << std::endl;
fout << "   std::vector<double>  fBoostWeights; // the weights applied in the individual boosts" << std::endl;
fout << "};" << std::endl << std::endl;
fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << "   double myMVA = 0;" << std::endl;
// ...
fout << "   if (inputValues["<<ivar<<"] < " << fLowBkgCut[ivar] << ") return -1;  // is background preselection cut" << std::endl;
// ...
fout << "   if (inputValues["<<ivar<<"] < "<< fLowSigCut[ivar] << ") return  1;  // is signal preselection cut" << std::endl;
// ...
fout << "   if (inputValues["<<ivar<<"] > "<< fHighBkgCut[ivar] << ") return -1;  // is background preselection cut" << std::endl;
// ...
fout << "   if (inputValues["<<ivar<<"] > "<< fHighSigCut[ivar] << ") return  1;  // is signal preselection cut" << std::endl;
// ...
fout << "   double norm = 0;" << std::endl;
// ...
fout << "   for (unsigned int itree=0; itree<fForest.size(); itree++){" << std::endl;
fout << "      "<<nodeName<<" *current = fForest[itree];" << std::endl;
fout << "      while (current->GetNodeType() == 0) { // intermediate node" << std::endl;
fout << "         if (current->GoesRight(inputValues)) current=("<<nodeName<<"*)current->GetRight();" << std::endl;
fout << "         else current=("<<nodeName<<"*)current->GetLeft();" << std::endl;
fout << "      }" << std::endl;
// ...
fout << "      myMVA += current->GetResponse();" << std::endl;
// ...
if (fUseYesNoLeaf) fout << "      myMVA += fBoostWeights[itree] * current->GetNodeType();" << std::endl;
else               fout << "      myMVA += fBoostWeights[itree] * current->GetPurity();" << std::endl;
fout << "      norm += fBoostWeights[itree];" << std::endl;
// ...
fout << "   }" << std::endl;
// ...
fout << "   return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << std::endl;
// ...
else fout << "   return myMVA /= norm;" << std::endl;
fout << "}" << std::endl << std::endl;
fout << "void " << className << "::Initialize()" << std::endl;
fout << "{" << std::endl;
// ...
fout << "   // itree = " << itree << std::endl;
fout << "   fBoostWeights.push_back(" << fBoostWeights[itree] << ");" << std::endl;
fout << "   fForest.push_back( " << std::endl;
// ...
fout << "   );" << std::endl;
// ...
fout << "   return;" << std::endl;
fout << "}" << std::endl;
fout << " " << std::endl;
fout << "// Clean up" << std::endl;
fout << "inline void " << className << "::Clear() " << std::endl;
fout << "{" << std::endl;
fout << "   for (unsigned int itree=0; itree<fForest.size(); itree++) { " << std::endl;
fout << "      delete fForest[itree]; " << std::endl;
fout << "   }" << std::endl;
fout << "}" << std::endl;
// MethodBDT::MakeClassSpecificHeader() (excerpt): emit the node class used above
fout << "#define NN new "<<nodeName << std::endl;
fout << " " << std::endl;
fout << "#ifndef "<<nodeName<<"__def" << std::endl;
fout << "#define "<<nodeName<<"__def" << std::endl;
fout << " " << std::endl;
fout << "class "<<nodeName<<" {" << std::endl;
fout << " " << std::endl;
fout << "public:" << std::endl;
fout << " " << std::endl;
fout << "   // constructor of an essentially \"empty\" node floating in space" << std::endl;
fout << "   "<<nodeName<<" ( "<<nodeName<<"* left,"<<nodeName<<"* right," << std::endl;
// ...
fout << "             int nFisherCoeff," << std::endl;
// ...
fout << "             double fisherCoeff"<<i<<"," << std::endl;
// ...
fout << "             int selector, double cutValue, bool cutType, " << std::endl;
fout << "             int nodeType, double purity, double response ) :" << std::endl;
fout << "   fLeft         ( left         )," << std::endl;
fout << "   fRight        ( right        )," << std::endl;
if (fUseFisherCuts) fout << "   fNFisherCoeff ( nFisherCoeff )," << std::endl;
fout << "   fSelector     ( selector     )," << std::endl;
fout << "   fCutValue     ( cutValue     )," << std::endl;
fout << "   fCutType      ( cutType      )," << std::endl;
fout << "   fNodeType     ( nodeType     )," << std::endl;
fout << "   fPurity       ( purity       )," << std::endl;
fout << "   fResponse     ( response     ){" << std::endl;
// ...
fout << "      fFisherCoeff.push_back(fisherCoeff"<<i<<");" << std::endl;
// ...
fout << "   }" << std::endl << std::endl;
fout << "   virtual ~"<<nodeName<<"();" << std::endl << std::endl;
fout << "   // test event if it descends the tree at this node to the right" << std::endl;
fout << "   virtual bool GoesRight( const std::vector<double>& inputValues ) const;" << std::endl;
fout << "   "<<nodeName<<"* GetRight( void ) { return fRight; };" << std::endl << std::endl;
fout << "   // test event if it descends the tree at this node to the left " << std::endl;
fout << "   virtual bool GoesLeft ( const std::vector<double>& inputValues ) const;" << std::endl;
fout << "   "<<nodeName<<"* GetLeft( void ) { return fLeft; }; " << std::endl << std::endl;
fout << "   // return S/(S+B) (purity) at this node (from training)" << std::endl << std::endl;
fout << "   double GetPurity( void ) const { return fPurity; } " << std::endl;
fout << "   // return the node type" << std::endl;
fout << "   int    GetNodeType( void ) const { return fNodeType; }" << std::endl;
fout << "   double GetResponse(void) const {return fResponse;}" << std::endl << std::endl;
fout << "private:" << std::endl << std::endl;
fout << "   "<<nodeName<<"* fLeft;    // pointer to the left daughter node" << std::endl;
fout << "   "<<nodeName<<"* fRight;   // pointer to the right daughter node" << std::endl;
// ...
fout << "   int fNFisherCoeff;                // =0 if this node doesn't use Fisher, else =nvar+1 " << std::endl;
fout << "   std::vector<double> fFisherCoeff; // the Fisher coefficients (offset at the last element)" << std::endl;
// ...
fout << "   int    fSelector; // index of variable used in node selection (decision tree) " << std::endl;
fout << "   double fCutValue; // cut value applied on this node to discriminate bkg against sig" << std::endl;
fout << "   bool   fCutType;  // true: if event variable > cutValue ==> signal , false otherwise" << std::endl;
fout << "   int    fNodeType; // type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal " << std::endl;
fout << "   double fPurity;   // purity of node from training" << std::endl;
fout << "   double fResponse; // regression response value of node" << std::endl;
fout << "}; " << std::endl;
fout << " " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "   "<<nodeName<<"::~"<<nodeName<<"()" << std::endl;
fout << "{" << std::endl;
fout << "   if (fLeft  != NULL) delete fLeft;" << std::endl;
fout << "   if (fRight != NULL) delete fRight;" << std::endl;
fout << "}" << std::endl;
fout << " " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "bool "<<nodeName<<"::GoesRight( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << "   // test event if it descends the tree at this node to the right" << std::endl;
fout << "   bool result;" << std::endl;
// ...
fout << "   if (fNFisherCoeff == 0){" << std::endl;
fout << "      result = (inputValues[fSelector] > fCutValue );" << std::endl;
fout << "   }else{" << std::endl;
fout << "      double fisher = fFisherCoeff.at(fFisherCoeff.size()-1);" << std::endl;
fout << "      for (unsigned int ivar=0; ivar<fFisherCoeff.size()-1; ivar++)" << std::endl;
fout << "         fisher += fFisherCoeff.at(ivar)*inputValues.at(ivar);" << std::endl;
fout << "      result = fisher > fCutValue;" << std::endl;
fout << "   }" << std::endl;
// ...
fout << "   result = (inputValues[fSelector] > fCutValue );" << std::endl;
// ...
fout << "   if (fCutType == true) return result; // the cuts are selecting Signal ;" << std::endl;
fout << "   else return !result;" << std::endl;
fout << "}" << std::endl;
fout << " " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "bool "<<nodeName<<"::GoesLeft( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << "   // test event if it descends the tree at this node to the left" << std::endl;
fout << "   if (!this->GoesRight(inputValues)) return true;" << std::endl;
fout << "   else return false;" << std::endl;
fout << "}" << std::endl;
fout << " " << std::endl;
fout << "#endif" << std::endl;
fout << " " << std::endl;
// MethodBDT::MakeClassInstantiateNode() (excerpt)
Log() << kFATAL << "MakeClassInstantiateNode: started with undefined node" << Endl;
// ...
fout << "NN("<<std::endl;
// ...
fout << ", " <<std::endl;
// ...
fout << ", " << std::endl
     << std::setprecision(6);
// MethodBDT::DeterminePreselectionCuts() (excerpt)
Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
// ...
std::vector<TMVA::BDTEventWrapper> bdtEventSample;
// ...
for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
   // ...
   nTotS += (*it)->GetWeight();
   // ...
   nTotB += (*it)->GetWeight();
   // ...
}
// ...
std::sort( bdtEventSample.begin(),bdtEventSample.end() );
// ...
Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
for( ; it != it_end; ++it ) {
   // ...
   sigWeightCtr += (**it)->GetWeight();
   // ...
   bkgWeightCtr += (**it)->GetWeight();
   // ...
   it->SetCumulativeWeight(false,bkgWeightCtr);
   it->SetCumulativeWeight(true,sigWeightCtr);
}
// ...
Double_t nSelS, nSelB, effS=0.05, effB=0.05, rejS=0.05, rejB=0.05;
Double_t tmpEffS, tmpEffB, tmpRejS, tmpRejB;
// ...
for(UInt_t iev = 1; iev < bdtEventSample.size(); iev++) {
   // ...
   nSelS = bdtEventSample[iev].GetCumulativeWeight(true);
   nSelB = bdtEventSample[iev].GetCumulativeWeight(false);
   // ...
   tmpEffS=nSelS/nTotS;
   tmpEffB=nSelB/nTotB;
   // ...
   else if (nSelB==0 && tmpEffS>effS) { effS=tmpEffS; fLowSigCut[ivar] = bdtEventSample[iev].GetVal() - dVal; fIsLowSigCut[ivar]=kTRUE; }
}
// ...
Log() << kINFO << " found and suggest the following possible pre-selection cuts " << Endl;
if (fDoPreselection) Log() << kINFO << "the training will be done after these cuts... and GetMVA value returns +1, (-1) for a signal (bkg) event that passes these cuts" << Endl;
else Log() << kINFO << "as option DoPreselection was not used, these cuts however will not be performed, but the training will see the full sample" << Endl;
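// A hedged one-variable sketch of what applying such a pre-selection cut
// means at evaluation time (cf. ApplyPreselectionCuts): events falling into
// a pure sideband are classified immediately, without consulting the forest.
// The names and the single-variable simplification are hypothetical.
double ApplyOnePreselectionCut(double v,
                               bool isLowBkgCut, double lowBkgCut,
                               bool isLowSigCut, double lowSigCut)
{
   if (isLowBkgCut && v < lowBkgCut) return -1;  // pure-background sideband
   if (isLowSigCut && v < lowSigCut) return +1;  // pure-signal sideband
   return 0;                                     // no decision: run the BDT
}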