$darkmode
DENOPTIM
ParallelFragmentationAlgorithm.java
Go to the documentation of this file.
1/*
2 * DENOPTIM
3 * Copyright (C) 2022 Marco Foscato <marco.foscato@uib.no>
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Affero General Public License as published
7 * by the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Affero General Public License for more details.
14 *
15 * You should have received a copy of the GNU Affero General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19package denoptim.fragmenter;
20
21import java.io.File;
22import java.io.FileFilter;
23import java.io.FileNotFoundException;
24import java.io.IOException;
25import java.util.ArrayList;
26import java.util.Arrays;
27import java.util.Comparator;
28import java.util.LinkedHashMap;
29import java.util.List;
30import java.util.logging.Level;
31import java.util.stream.Collectors;
32
33import org.apache.commons.io.FileUtils;
34import org.openscience.cdk.exception.CDKException;
35import org.openscience.cdk.interfaces.IAtomContainer;
36import org.openscience.cdk.interfaces.IBond;
37import org.openscience.cdk.io.iterator.IteratingSMILESReader;
38
39import denoptim.constants.DENOPTIMConstants;
40import denoptim.exception.DENOPTIMException;
41import denoptim.files.FileFormat;
42import denoptim.io.DenoptimIO;
43import denoptim.io.IteratingAtomContainerReader;
44import denoptim.programs.RunTimeParameters.ParametersType;
45import denoptim.programs.fragmenter.FragmenterParameters;
46import denoptim.task.ParallelAsynchronousTaskExecutor;
47import denoptim.utils.MoleculeUtils;
48
49
57{
58
62 private File[] structures;
63
68
69
70//-----------------------------------------------------------------------------
71
77 {
79 this.settings = settings;
80 }
81
82//------------------------------------------------------------------------------
83
84 protected boolean doPreFlightOperations()
85 {
87 try
88 {
90 (new File(settings.getStructuresFile()));
91
92 } catch (IOException | CDKException e1)
93 {
94 throw new Error("Error reading file '" + settings.getStructuresFile()
95 + "'. " + e1.getMessage());
96 }
97 // Detect dimensionality of the molecules
98 if (reader.getIteratorType().equals(IteratingSMILESReader.class))
99 {
101 }
102 // Split data in batches for parallelization
103
104 // This is the collector of the mutating pathname to the file collecting
105 // the input structures for each thread.
106 structures = new File[settings.getNumTasks()];
107 structures[0] = new File(settings.getStructuresFile());
108
109 // WARNING while splitting the molecules we also do the preprocessing of
110 // the molecules. This to avoid having to read them once again. Yet,
111 // if we have no checks to be done, we are effectively copy-pasting
112 // the file with the list of molecules to chop.
114 for (int i=0; i<settings.getNumTasks(); i++)
115 {
117 }
118 return true;
119 }
120
121//------------------------------------------------------------------------------
122
123 protected void createAndSubmitTasks()
124 {
125 for (int i=0; i<settings.getNumTasks(); i++)
126 {
127 FragmenterTask task;
128 try
129 {
130 task = new FragmenterTask(structures[i], settings, i);
131 } catch (SecurityException | IOException e)
132 {
133 throw new Error("Unable to start fragmentation thread.",e);
134 }
135 submitTask(task, task.getLogFilePathname());
136 }
137 }
138
139//------------------------------------------------------------------------------
140
141 protected boolean doPostFlightOperations()
142 {
143 // Identify (and possibly collect) final results. The files collecting
144 // results change depending on the task we have done, and on whether
145 // we ran them in a parallelized fashion or not.
146 List<File> resultFiles = new ArrayList<File>();
148 {
151 try
152 {
153 extractor.run();
154 } catch (Exception e)
155 {
156 throw new Error("Could not extract the most common conformer. "
157 + e.getMessage(), e);
158 }
159 for (String pathname : extractor.getResults())
160 {
161 resultFiles.add(new File(pathname));
162 }
163 } else {
165 {
166 // We collect only the unique champion of each isomorphic family.
168 new File(settings.getWorkDirectory()));
169 } else if (settings.getNumTasks()>1) {
170 resultFiles = results.stream()
171 .map(obj -> (String) obj)
172 .map(pathname -> new File(pathname))
173 .collect(Collectors.toList());
174 } else if (settings.getNumTasks()==1) {
175 // We should be here only when we run on single thread with no
176 // handling of isomorphic families (i.e., no removal of
177 // duplicates)
178 resultFiles.add(new File ((String) results.get(0)));
179 }
180 }
181
182 // In case we did not produce anything
183 if (resultFiles.size()==0)
184 {
185 settings.getLogger().log(Level.INFO, "No results to collect. "
186 + "All done.");
187 return true;
188 }
189
190 // If we did produce something, we go ahead
191 File allFragsFile;
192 FileFormat outputFormat = null;
194 {
195 allFragsFile = new File(FragmenterTask.getFragmentsFileName(
196 settings));
198 } else {
199 allFragsFile = new File(FragmenterTask.getResultsFileName(
200 settings));
201 outputFormat = FileFormat.MOLSDF;
202 }
203
204 switch (outputFormat)
205 {
206 case MOLSDF:
207 try
208 {
209 FileUtils.copyFile(resultFiles.get(0), allFragsFile);
210 if (resultFiles.size()>1)
211 {
212 DenoptimIO.appendTxtFiles(allFragsFile,
213 resultFiles.subList(1,resultFiles.size()));
214 }
215 } catch (IOException e)
216 {
217 throw new Error("Unable to create new file '"
218 + allFragsFile + "'",e);
219 }
220 break;
221
222 case VRTXSDF:
223 try
224 {
225 FileUtils.copyFile(resultFiles.get(0), allFragsFile);
226 if (resultFiles.size()>1)
227 {
228 DenoptimIO.appendTxtFiles(allFragsFile,
229 resultFiles.subList(1,resultFiles.size()));
230 }
231 } catch (IOException e)
232 {
233 throw new Error("Unable to create new file '"
234 + allFragsFile + "'",e);
235 }
236 break;
237
238 case VRTXJSON:
239 //TODO
240 // also check allFragsFile: it already contains extension.
241 throw new Error("NOT IMPLEMENTED YET!");
242
243
244
245 default:
246 throw new Error("Unexpected format "
248 + "for final collection of fragments");
249 }
250
251 settings.getLogger().log(Level.INFO, "Results "
252 + "collected in file " + allFragsFile);
253
254 return true;
255 }
256
257//------------------------------------------------------------------------------
258
260 File workDir)
261 {
262 List<File> files = Arrays.stream(workDir.listFiles(new FileFilter(){
263 @Override
264 public boolean accept(File pathname) {
265 if (pathname.getName().startsWith(
267 && pathname.getName().contains(
269 {
270 return true;
271 }
272 return false;
273 }
274 })).collect(Collectors.toList());
275 files.sort(new Comparator<File>() {
276
277 @Override
278 public int compare(File o1, File o2)
279 {
280 // The filename is like "MWSlot_50-52_Unq.sdf"
281 String s1 = o1.getName().replace(
283 int i1 = Integer.valueOf(s1.substring(0,s1.indexOf("-")));
284 String s2 = o2.getName().replace(
286 int i2 = Integer.valueOf(s2.substring(0,s2.indexOf("-")));
287 return Integer.compare(i1, i2);
288 }
289
290 });
291 return files;
292 }
293
294//------------------------------------------------------------------------------
295
309 {
310 int maxBuffersSize = 50000;
311 int numBatches = settings.getNumTasks();
312
313 //If available we record CSD formula in properties of atom container
314 LinkedHashMap<String,String> formulae = settings.getFormulae();
315
317 {
318 settings.getLogger().log(Level.INFO, "Combining structures and "
319 + "formulae...");
320 }
321 int index = -1;
322 int batchId = 0;
323 int buffersSize = 0;
324 boolean relyingOnListSize = false;
325 List<ArrayList<IAtomContainer>> batches =
326 new ArrayList<ArrayList<IAtomContainer>>();
327 for (int i=0; i<numBatches; i++)
328 {
329 batches.add(new ArrayList<IAtomContainer>());
330 }
331 try
332 {
333 while (reader.hasNext())
334 {
335 index++;
336 buffersSize++;
337 IAtomContainer mol = reader.next();
338
339 // Adjust molecular representation to our settings
341 index))
342 continue;
343
344 // It is convenient to place the formula in the atom container
345 if (formulae!=null && settings.doCheckFormula())
346 {
347 getFormulaForMol(mol, index, formulae);
348 }
349
350 batches.get(batchId).add(mol);
351
352 // Update batch ID for next mol
353 batchId++;
354 if (batchId >= numBatches)
355 batchId = 0;
356
357 // If max buffer size is reached, then bump to file
358 if (buffersSize >= maxBuffersSize)
359 {
360 buffersSize = 0;
361 for (int i=0; i<numBatches; i++)
362 {
363 String filename = getStructureFileNameBatch(settings, i);
364 try
365 {
366 DenoptimIO.writeSDFFile(filename, batches.get(i), true);
367 } catch (DENOPTIMException e)
368 {
369 throw new Error("Cannot write to '" + filename + "'.");
370 }
371 batches.get(i).clear();
372 }
373 }
374 }
375 } finally {
376 try {
377 reader.close();
378 } catch (IOException e1)
379 {
380 throw new Error("Could not close reader of SDF file '"
381 + settings.getStructuresFile() + "'",e1);
382 }
383 }
384
385 if (buffersSize < maxBuffersSize)
386 {
387 for (int i=0; i<numBatches; i++)
388 {
389 String filename = getStructureFileNameBatch(settings, i);
390 try
391 {
392 DenoptimIO.writeSDFFile(filename, batches.get(i), true);
393 } catch (DENOPTIMException e)
394 {
395 throw new Error("Cannot write to '" + filename + "'.");
396 }
397 batches.get(i).clear();
398 }
399 }
400
401 // Check for consistency in the list of formulae
402 if (formulae!=null && relyingOnListSize
403 && index != (formulae.size()-1))
404 {
405 throw new Error("Inconsistent number of formulae "
406 + "(" + formulae.size() + ") "
407 + "and structures ("+ index + ") when using the index "
408 + "in the list of formulae as identifier. For your "
409 + "sake this in not allowed.");
410 }
411 }
412
413//------------------------------------------------------------------------------
414
424 {
425 return settings.getWorkDirectory() + DenoptimIO.FS
426 + "structuresBatch-" + i + ".sdf";
427 }
428
429//------------------------------------------------------------------------------
430
437 private static boolean getFormulaForMol(IAtomContainer mol, int index,
438 LinkedHashMap<String,String> formulae)
439 {
440 boolean relyingOnListSize = false;
441
442 List<String> formulaeList = new ArrayList<String>(formulae.values());
443
444 String molName = mol.getTitle();
445 if (molName!=null && !molName.isBlank())
446 {
447 if (formulae.containsKey(molName))
448 {
449 mol.setProperty(DENOPTIMConstants.FORMULASTR,
450 formulae.get(molName));
451 } else {
452 relyingOnListSize = true;
453 if (index<formulae.size())
454 {
455 mol.setProperty(DENOPTIMConstants.FORMULASTR,
456 formulaeList.get(index));
457 } else {
458 throw new Error("There are not "
459 + "enough formulae! Looking for "
460 + "formula #"+ index + " but there are "
461 + "only " + formulae.size()
462 + "entries.");
463 }
464 }
465 } else {
466 relyingOnListSize = true;
467 if (index<formulae.size())
468 {
469 mol.setProperty(DENOPTIMConstants.FORMULASTR,
470 formulaeList.get(index));
471 } else {
472 throw new Error("There are not "
473 + "enough formulae! Looking for "
474 + "formula #"+ index + " but there are "
475 + "only " + formulae.size()
476 + "entries.");
477 }
478 }
479 return relyingOnListSize;
480 }
481
482//------------------------------------------------------------------------------
483
484}
General set of constants used in DENOPTIM.
static final Object FORMULASTR
Property name used to store molecular formula as string in an atom container.
static final String MWSLOTFRAGSUNQFILENANEEND
Final part of filename used to collect unique fragments in a certain molecular weight slot.
static final FileFormat TMPFRAGFILEFORMAT
Format for intermediate files used during fragmentation.
static final String MWSLOTFRAGSFILENAMEROOT
Initial part of filename used to collect fragments belonging to a certain molecular weight slot.
Task that performs the various steps in the process that prepares chemical structures to be chopped,...
static String getFragmentsFileName(FragmenterParameters settings, int i)
Builds the pathname of the structure file meant to hold fragments resulting from this task.
static String getResultsFileName(FragmenterParameters settings)
Builds the pathname of the structure file meant to hold results that are not necessarily fragments.
static boolean prepareMolToFragmentation(IAtomContainer mol, FragmenterParameters settings, int index)
Do any pre-processing on a IAtomContainer meant to be fragmented.
Runs threads that extract the most representative conformer of fragments given as input.
List< String > getResults()
Returns the list of pathnames collecting the most representative conformers, as defined by the settin...
Fragments a list of chemical systems by running parallel fragmentation tasks.
static List< File > getFilesCollectingIsomorphicFamilyChampions(File workDir)
void createAndSubmitTasks()
Implementations of this method must call the submitTask(Task, String) method to actually send the tas...
static String getStructureFileNameBatch(FragmenterParameters settings, int i)
Builds the pathname of the structure file generated for one of the parallel threads.
FragmenterParameters settings
All settings controlling the tasks executed by this class.
ParallelFragmentationAlgorithm(FragmenterParameters settings)
Constructor.
static void splitInputForThreads(FragmenterParameters settings, IteratingAtomContainerReader reader)
Splits the input data (from FragmenterParameters) into batches suitable for parallel batch processing...
static boolean getFormulaForMol(IAtomContainer mol, int index, LinkedHashMap< String, String > formulae)
Takes the molecular formula from the given list of formulae and using the 'Title' property of the ind...
Utility methods for input/output.
static void writeSDFFile(String fileName, IAtomContainer mol)
Writes IAtomContainer to SDF file.
static void appendTxtFiles(File f1, List< File > files)
Appends the given files to the first one.
An iterator that takes IAtomContainers from a file, possibly using an available iterating reader,...
void close()
Close the memory-efficient iterator if any is open.
String getWorkDirectory()
Gets the pathname to the working directory.
Logger getLogger()
Get the name of the program specific logger.
Parameters controlling execution of the fragmenter.
boolean doFragmentation
Flag requesting the fragmentation of the structures.
boolean doExtactRepresentativeConformer
Flag signaling the request to analyze each isomorphic family to extract the most representative fragm...
void setWorkingIn3D(boolean workingIn3D)
Sets boolean variable workingIn3D.
final List< Object > results
List of object returned by completed tasks.
File formats identified by DENOPTIM.
Definition: FileFormat.java:32