$darkmode
DENOPTIM
ParallelFragmentationAlgorithm.java
Go to the documentation of this file.
1/*
2 * DENOPTIM
3 * Copyright (C) 2022 Marco Foscato <marco.foscato@uib.no>
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Affero General Public License as published
7 * by the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Affero General Public License for more details.
14 *
15 * You should have received a copy of the GNU Affero General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19package denoptim.fragmenter;
20
21import java.io.File;
22import java.io.FileFilter;
23import java.io.FileNotFoundException;
24import java.io.IOException;
25import java.util.ArrayList;
26import java.util.Arrays;
27import java.util.Comparator;
28import java.util.LinkedHashMap;
29import java.util.List;
30import java.util.logging.Level;
31import java.util.stream.Collectors;
32
33import org.apache.commons.io.FileUtils;
34import org.openscience.cdk.exception.CDKException;
35import org.openscience.cdk.interfaces.IAtomContainer;
36import org.openscience.cdk.io.iterator.IteratingSMILESReader;
37
38import denoptim.constants.DENOPTIMConstants;
39import denoptim.exception.DENOPTIMException;
40import denoptim.files.FileFormat;
41import denoptim.io.DenoptimIO;
42import denoptim.io.IteratingAtomContainerReader;
43import denoptim.programs.fragmenter.FragmenterParameters;
44import denoptim.task.ParallelAsynchronousTaskExecutor;
45
46
54{
55
59 private File[] structures;
60
65
66
67//-----------------------------------------------------------------------------
68
74 {
76 this.settings = settings;
77 }
78
79//------------------------------------------------------------------------------
80
81 protected boolean doPreFlightOperations()
82 {
84 try
85 {
87 (new File(settings.getStructuresFile()));
88
89 } catch (IOException | CDKException e1)
90 {
91 throw new Error("Error reading file '" + settings.getStructuresFile()
92 + "'. " + e1.getMessage());
93 }
94 // Detect dimensionality of the molecules
95 if (reader.getIteratorType().equals(IteratingSMILESReader.class))
96 {
98 }
99 // Split data in batches for parallelization
100
101 // This is the collector of the mutating pathname to the file collecting
102 // the input structures for each thread.
103 structures = new File[settings.getNumTasks()];
104 structures[0] = new File(settings.getStructuresFile());
105
106 // WARNING while splitting the molecules we also do the preprocessing of
107 // the molecules. This to avoid having to read them once again. Yet,
108 // if we have no checks to be done, we are effectively copy-pasting
109 // the file with the list of molecules to chop.
111 for (int i=0; i<settings.getNumTasks(); i++)
112 {
114 }
115 return true;
116 }
117
118//------------------------------------------------------------------------------
119
120 protected void createAndSubmitTasks()
121 {
122 for (int i=0; i<settings.getNumTasks(); i++)
123 {
124 FragmenterTask task;
125 try
126 {
127 task = new FragmenterTask(structures[i], settings, i);
128 } catch (SecurityException | IOException e)
129 {
130 throw new Error("Unable to start fragmentation thread.",e);
131 }
132 submitTask(task, task.getLogFilePathname());
133 }
134 }
135
136//------------------------------------------------------------------------------
137
138 protected boolean doPostFlightOperations()
139 {
140 // Identify (and possibly collect) final results. The files collecting
141 // results change depending on the task we have done, and on whether
142 // we ran them in a parallelized fashion or not.
143 List<File> resultFiles = new ArrayList<File>();
145 {
148 try
149 {
150 extractor.run();
151 } catch (Exception e)
152 {
153 throw new Error("Could not extract the most common conformer. "
154 + e.getMessage(), e);
155 }
156 for (String pathname : extractor.getResults())
157 {
158 resultFiles.add(new File(pathname));
159 }
160 } else {
162 {
163 // We collect only the unique champion of each isomorphic family.
165 new File(settings.getWorkDirectory()));
166 } else if (settings.getNumTasks()>1) {
167 resultFiles = results.stream()
168 .map(obj -> (String) obj)
169 .map(pathname -> new File(pathname))
170 .collect(Collectors.toList());
171 } else if (settings.getNumTasks()==1) {
172 // We should be here only when we run on single thread with no
173 // handling of isomorphic families (i.e., no removal of
174 // duplicates)
175 resultFiles.add(new File ((String) results.get(0)));
176 }
177 }
178
179 // In case we did not produce anything
180 if (resultFiles.size()==0)
181 {
182 settings.getLogger().log(Level.INFO, "No results to collect. "
183 + "All done.");
184 return true;
185 }
186
187 // If we did produce something, we go ahead
188 File allFragsFile;
189 FileFormat outputFormat = null;
191 {
192 allFragsFile = new File(FragmenterTask.getFragmentsFileName(
193 settings));
195 } else {
196 allFragsFile = new File(FragmenterTask.getResultsFileName(
197 settings));
198 outputFormat = FileFormat.MOLSDF;
199 }
200
201 switch (outputFormat)
202 {
203 case MOLSDF:
204 try
205 {
206 FileUtils.copyFile(resultFiles.get(0), allFragsFile);
207 if (resultFiles.size()>1)
208 {
209 DenoptimIO.appendTxtFiles(allFragsFile,
210 resultFiles.subList(1,resultFiles.size()));
211 }
212 } catch (IOException e)
213 {
214 throw new Error("Unable to create new file '"
215 + allFragsFile + "'",e);
216 }
217 break;
218
219 case VRTXSDF:
220 try
221 {
222 FileUtils.copyFile(resultFiles.get(0), allFragsFile);
223 if (resultFiles.size()>1)
224 {
225 DenoptimIO.appendTxtFiles(allFragsFile,
226 resultFiles.subList(1,resultFiles.size()));
227 }
228 } catch (IOException e)
229 {
230 throw new Error("Unable to create new file '"
231 + allFragsFile + "'",e);
232 }
233 break;
234
235 case VRTXJSON:
236 //TODO
237 // also check allFragsFile: it already contains extension.
238 throw new Error("NOT IMPLEMENTED YET!");
239
240
241
242 default:
243 throw new Error("Unexpected format "
245 + "for final collection of fragments");
246 }
247
248 settings.getLogger().log(Level.INFO, "Results "
249 + "collected in file " + allFragsFile);
250
251 return true;
252 }
253
254//------------------------------------------------------------------------------
255
257 File workDir)
258 {
259 List<File> files = Arrays.stream(workDir.listFiles(new FileFilter(){
260 @Override
261 public boolean accept(File pathname) {
262 if (pathname.getName().startsWith(
264 && pathname.getName().contains(
266 {
267 return true;
268 }
269 return false;
270 }
271 })).collect(Collectors.toList());
272 files.sort(new Comparator<File>() {
273
274 @Override
275 public int compare(File o1, File o2)
276 {
277 // The filename is like "MWSlot_50-52_Unq.sdf"
278 String s1 = o1.getName().replace(
280 int i1 = Integer.valueOf(s1.substring(0,s1.indexOf("-")));
281 String s2 = o2.getName().replace(
283 int i2 = Integer.valueOf(s2.substring(0,s2.indexOf("-")));
284 return Integer.compare(i1, i2);
285 }
286
287 });
288 return files;
289 }
290
291//------------------------------------------------------------------------------
292
306 {
307 int maxBuffersSize = 50000;
308 int numBatches = settings.getNumTasks();
309
310 //If available we record CSD formula in properties of atom container
311 LinkedHashMap<String,String> formulae = settings.getFormulae();
312
314 {
315 settings.getLogger().log(Level.INFO, "Combining structures and "
316 + "formulae...");
317 }
318 int index = -1;
319 int batchId = 0;
320 int buffersSize = 0;
321 boolean relyingOnListSize = false;
322 List<ArrayList<IAtomContainer>> batches =
323 new ArrayList<ArrayList<IAtomContainer>>();
324 for (int i=0; i<numBatches; i++)
325 {
326 batches.add(new ArrayList<IAtomContainer>());
327 }
328 try
329 {
330 while (reader.hasNext())
331 {
332 index++;
333 buffersSize++;
334 IAtomContainer mol = reader.next();
335
336 // Adjust molecular representation to our settings
338 index))
339 continue;
340
341 // It is convenient to place the formula in the atom container
342 if (formulae!=null && settings.doCheckFormula())
343 {
344 getFormulaForMol(mol, index, formulae);
345 }
346
347 batches.get(batchId).add(mol);
348
349 // Update batch ID for next mol
350 batchId++;
351 if (batchId >= numBatches)
352 batchId = 0;
353
354 // If max buffer size is reached, then bump to file
355 if (buffersSize >= maxBuffersSize)
356 {
357 buffersSize = 0;
358 for (int i=0; i<numBatches; i++)
359 {
360 String filename = getStructureFileNameBatch(settings, i);
361 try
362 {
363 DenoptimIO.writeSDFFile(filename, batches.get(i), true);
364 } catch (DENOPTIMException e)
365 {
366 throw new Error("Cannot write to '" + filename + "'.");
367 }
368 batches.get(i).clear();
369 }
370 }
371 }
372 } finally {
373 try {
374 reader.close();
375 } catch (IOException e1)
376 {
377 throw new Error("Could not close reader of SDF file '"
378 + settings.getStructuresFile() + "'",e1);
379 }
380 }
381
382 if (buffersSize < maxBuffersSize)
383 {
384 for (int i=0; i<numBatches; i++)
385 {
386 String filename = getStructureFileNameBatch(settings, i);
387 try
388 {
389 DenoptimIO.writeSDFFile(filename, batches.get(i), true);
390 } catch (DENOPTIMException e)
391 {
392 throw new Error("Cannot write to '" + filename + "'.");
393 }
394 batches.get(i).clear();
395 }
396 }
397
398 // Check for consistency in the list of formulae
399 if (formulae!=null && relyingOnListSize
400 && index != (formulae.size()-1))
401 {
402 throw new Error("Inconsistent number of formulae "
403 + "(" + formulae.size() + ") "
404 + "and structures ("+ index + ") when using the index "
405 + "in the list of formulae as identifier. For your "
406 + "sake this in not allowed.");
407 }
408 }
409
410//------------------------------------------------------------------------------
411
421 {
422 return settings.getWorkDirectory() + DenoptimIO.FS
423 + "structuresBatch-" + i + ".sdf";
424 }
425
426//------------------------------------------------------------------------------
427
434 private static boolean getFormulaForMol(IAtomContainer mol, int index,
435 LinkedHashMap<String,String> formulae)
436 {
437 boolean relyingOnListSize = false;
438
439 List<String> formulaeList = new ArrayList<String>(formulae.values());
440
441 String molName = mol.getTitle();
442 if (molName!=null && !molName.isBlank())
443 {
444 if (formulae.containsKey(molName))
445 {
446 mol.setProperty(DENOPTIMConstants.FORMULASTR,
447 formulae.get(molName));
448 } else {
449 relyingOnListSize = true;
450 if (index<formulae.size())
451 {
452 mol.setProperty(DENOPTIMConstants.FORMULASTR,
453 formulaeList.get(index));
454 } else {
455 throw new Error("There are not "
456 + "enough formulae! Looking for "
457 + "formula #"+ index + " but there are "
458 + "only " + formulae.size()
459 + "entries.");
460 }
461 }
462 } else {
463 relyingOnListSize = true;
464 if (index<formulae.size())
465 {
466 mol.setProperty(DENOPTIMConstants.FORMULASTR,
467 formulaeList.get(index));
468 } else {
469 throw new Error("There are not "
470 + "enough formulae! Looking for "
471 + "formula #"+ index + " but there are "
472 + "only " + formulae.size()
473 + "entries.");
474 }
475 }
476 return relyingOnListSize;
477 }
478
479//------------------------------------------------------------------------------
480
481}
General set of constants used in DENOPTIM.
static final Object FORMULASTR
Property name used to store molecular formula as string in an atom container.
static final String MWSLOTFRAGSUNQFILENANEEND
Final part of filename used to collect unique fragments in a certain molecular weight slot.
static final FileFormat TMPFRAGFILEFORMAT
Format for intermediate files used during fragmentation.
static final String MWSLOTFRAGSFILENAMEROOT
Initial part of filename used to collect fragments belonging to a certain molecular weight slot.
Task that performs the various steps in the process that prepares chemical structures to be chopped,...
static String getFragmentsFileName(FragmenterParameters settings, int i)
Builds the pathname of the structure file meant to hold fragments resulting from this task.
static String getResultsFileName(FragmenterParameters settings)
Builds the pathname of the structure file meant to hold results that are not necessarily fragments.
static boolean prepareMolToFragmentation(IAtomContainer mol, FragmenterParameters settings, int index)
Do any pre-processing on an IAtomContainer meant to be fragmented.
Runs threads that extract the most representative conformer of fragments given as input.
List< String > getResults()
Returns the list of pathnames collecting the most representative conformers, as defined by the settin...
Fragments a list of chemical systems by running parallel fragmentation tasks.
static List< File > getFilesCollectingIsomorphicFamilyChampions(File workDir)
void createAndSubmitTasks()
Implementations of this method must call the submitTask(Task, String) method to actually send the tas...
static String getStructureFileNameBatch(FragmenterParameters settings, int i)
Builds the pathname of the structure file generated for one of the parallel threads.
FragmenterParameters settings
All settings controlling the tasks executed by this class.
ParallelFragmentationAlgorithm(FragmenterParameters settings)
Constructor.
static void splitInputForThreads(FragmenterParameters settings, IteratingAtomContainerReader reader)
Splits the input data (from FragmenterParameters) into batches suitable for parallel batch processing...
static boolean getFormulaForMol(IAtomContainer mol, int index, LinkedHashMap< String, String > formulae)
Takes the molecular formula from the given list of formulae and using the 'Title' property of the ind...
Utility methods for input/output.
static void writeSDFFile(String fileName, IAtomContainer mol)
Writes IAtomContainer to SDF file.
static void appendTxtFiles(File f1, List< File > files)
Appends the second file to the first.
An iterator that takes IAtomContainers from a file, possibly using an available iterating reader,...
void close()
Close the memory-efficient iterator if any is open.
String getWorkDirectory()
Gets the pathname to the working directory.
Logger getLogger()
Get the name of the program specific logger.
Parameters controlling execution of the fragmenter.
boolean doFragmentation
Flag requesting the fragmentation of the structures.
boolean doExtactRepresentativeConformer
Flag signaling the request to analyze each isomorphic family to extract the most representative fragm...
void setWorkingIn3D(boolean workingIn3D)
Sets boolean variable workingIn3D.
final List< Object > results
List of objects returned by completed tasks.
File formats identified by DENOPTIM.
Definition: FileFormat.java:32