Объединить N файлов журнала, сохраняя хронологический порядок

Question

Объединить N файлов журнала, сохраняя хронологический порядок

У меня есть N различных файлов журнала, поступающих от N различных служб, работающих на нашем устройстве. Я хочу объединить N файлов в один файл в хронологическом порядке. Размер файла может варьироваться от нескольких КБ до ГБ.

Все N файлов журнала имеют одинаковый формат и выглядят так:

**********  LOGGING SESSION STARTED ************
* Hmsoa Version: 2.4.0.12
* Exe Path: c:\program files (x86)\silicon biosystems\deparray300a_driver\deparray300a_driver.exe
* Exe Version: 1.6.0.154
************************************************
TIME = 2017/02/01 11:12:12,180 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'Connect'->Enter;
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'Connect'->Exit=0;
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'CCisProxyLocal CONNECT - ok'->Enter;
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'CRecoveryAxesProxyLocal CONNECT - ok'->Enter;
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'CAmplifierProxyLocalV3 CONNECT - ok'->Enter;
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'SYSTEM_DIAGNOSIS_GET'->Enter;
TIME = 2017/02/01 11:12:12,211 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'SYSTEM_DIAGNOSIS_GET'->Exit=0;
TIME = 2017/02/01 11:12:12,211 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'LBL_SQUARE_SET'->Enter;
TIME = 2017/02/01 11:12:12,219 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'LBL_SQUARE_SET'->Exit=0;

Поскольку у меня уже есть N разных файлов, я до сих пор применял алгоритм внешней сортировки, считывающий по одной строке для каждого файла:

// NOTE(review): stdafx.h is presumably the MSVC precompiled header and is
// therefore kept as the first include - confirm against the project settings.
#include "stdafx.h"
// One directive per line: anything following the first #include on the same
// line is not processed as a directive.
#include "boost/regex.hpp"
#include "boost/lexical_cast.hpp"
#include "boost/filesystem.hpp"
#include <string>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <sstream>
#include <climits>
#include <ctime>
#include <utility>
#include <vector>
namespace fs = boost::filesystem;

// Matches a line that starts with either "TIME = YYYY/MM/DD " or "@" and
// captures, in group 1, the run of digits, ':', '.' and ',' that follows.
static const boost::regex expression(R"(^(?:(?:TIME\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))");
// Matches a file name that starts with a run of digits, '-' and '_' and
// captures, in group 1, the one or two words that follow. main() stores that
// capture next to the stream and it is appended to every merged line.
static const boost::regex nameFileEx(R"(^[\d\-\_]+(\w+\s?\w+|\w+))");
// Directory that is scanned for input logs and that receives the merged file.
static const std::string path("E:\\2017-02-01");
//static const std::string path("E:\\TestLog");

// Converts a time of day written as "H:M:S" or "H:M:S,mmm" into milliseconds
// since midnight.
//
// time  : text to parse; parsing stops at the first character that does not
//         fit the pattern, trailing characters are ignored.
// return: hours*3600000 + minutes*60000 + seconds*1000 + milliseconds.
//         The milliseconds default to 0 when the ",mmm" part is absent.
//         Returns 0 when fewer than three numeric fields could be read, so
//         every call has a defined result.
unsigned long time2Milleseconds(const std::string & time)
{
    // separators[k] is the character required in front of field k + 1.
    static const char separators[] = { ':', ':', ',' };
    unsigned long fields[4] = { 0, 0, 0, 0 }; // hours, minutes, seconds, milliseconds
    std::string::size_type pos = 0;
    int parsed = 0;

    while (parsed < 4)
    {
        if (parsed > 0)
        {
            if (pos >= time.size() || time[pos] != separators[parsed - 1])
                break;
            ++pos;
        }

        const std::string::size_type start = pos;
        unsigned long value = 0;
        while (pos < time.size() && time[pos] >= '0' && time[pos] <= '9')
        {
            value = value * 10 + static_cast<unsigned long>(time[pos] - '0');
            ++pos;
        }
        if (pos == start)
            break; // a number was expected here but no digit was found

        fields[parsed] = value;
        ++parsed;
    }

    if (parsed < 3)
        return 0;

    // Unsigned long arithmetic throughout: no signed overflow on odd input.
    return fields[0] * 3600000UL + fields[1] * 60000UL + fields[2] * 1000UL + fields[3];
}

// Consumes and discards up to seven lines from the stream of every entry in
// vifs; a stream that runs out earlier is simply left at end of file.
void readAllFilesUntilLine7(std::vector<std::pair<std::ifstream, std::string>> & vifs)
{
    std::string discarded;
    for (auto & entry : vifs)
    {
        for (int skipped = 0; skipped < 7; ++skipped)
        {
            if (!std::getline(entry.first, discarded))
                break;
        }
    }
}

void checkRegex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::vector<unsigned long> & logTime, std::vector<std::string> & lines, int index, int & counter)
{
std::string line;
boost::smatch what;
if (std::getline(vifs[index].first, line))
{
if (boost::regex_search(line, what, expression))
{
logTime[index] = time2Milleseconds(what[1]);
}
lines[index] = line;
}
else
{
--counter;
logTime[index] = ULONG_MAX;
}
}

// Merges the pending lines of all inputs into `file`, always emitting the line
// whose stored time is the smallest.
//
// vifs    : input streams paired with the label appended to each output line.
// logTime : per-input time of the pending line (ULONG_MAX once exhausted).
// lines   : per-input pending line.
// file    : destination stream.
// counter : on entry, the number of inputs; checkRegex() decrements it each
//           time an input reports end of file. Merging stops once it is no
//           longer positive.
//
// Nothing is written when there are no inputs or when every input is already
// exhausted after priming, so lines[] is never indexed for an empty set and
// the loop cannot miss its exit condition.
void mergeFiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::vector<unsigned long> & logTime, std::vector<std::string> & lines, std::ofstream & file, int & counter)
{
    // Prime every slot with the first pending line of its input.
    const int inputs = static_cast<int>(vifs.size());
    for (int i = 0; i < inputs; ++i)
    {
        checkRegex(vifs, logTime, lines, i, counter);
    }

    // Emit the earliest pending line, then refill the slot it came from.
    while (counter > 0)
    {
        const int index = static_cast<int>(std::min_element(logTime.begin(), logTime.end()) - logTime.begin());
        file << lines[index] << " --> " << vifs[index].second << "\n";
        checkRegex(vifs, logTime, lines, index, counter);
    }
}

int main()
{
clock_t begin = clock();
int cnt = std::count_if(fs::directory_iterator(path),fs::directory_iterator(),static_cast<bool(*)(const fs::path&)>(fs::is_regular_file));
std::vector<std::pair<std::ifstream, std::string>> vifs(cnt);
int index = 0;
boost::smatch what;
std::string file;
for (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d)
{
if (fs::is_regular_file(d->path()))
{
file = d->path().filename().string();
if (boost::regex_search(file, what, nameFileEx))
{
vifs[index++] = std::make_pair(std::ifstream(d->path().string()), what[1]);
}
}
}
std::vector<unsigned long> logTime(cnt, ULONG_MAX);
std::vector<std::string> lines(cnt);
std::ofstream filename(path + "\\TestLog.txt");
readAllFilesUntilLine7(vifs);
mergeFiles(vifs, logTime, lines, filename, cnt);
filename.close();
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
std::cout << "Elapsed time = " << elapsed_secs << "\n";
return 0;
}

Он делает именно то, что должен делать, но медленно. Чтобы объединить 82 файла с размерами от 1 КБ до 250 МБ и создать окончательный файл с более чем 6000000 строк, требуется 70 минут.

Как я могу ускорить алгоритм? Любая помощь очень ценится!

ОБНОВЛЕНИЕ

Я также реализовал версию с кучей:

Data.h:

#pragma once

#include <string>

// One pending log line: the text of the line, the time value it is ordered by
// (see Compare) and the index of the input it was read from.
class Data
{
public:
    Data(DWORD index,
        const std::string & line,
        ULONG time);
    ~Data();

    // A user-declared destructor suppresses the implicit move operations, so
    // containers would copy the line on every element shuffle. Request the
    // copy and move operations explicitly.
    Data(const Data &) = default;
    Data(Data &&) = default;
    Data & operator=(const Data &) = default;
    Data & operator=(Data &&) = default;

    // Time value used for ordering.
    ULONG getTime() const { return time; }
    // Index of the input this line came from.
    DWORD getIndex() const { return index; }
    // Text of the line; the reference stays valid only while this object lives.
    const std::string & getLine() const { return line; }
private:
    DWORD index;
    std::string line;
    ULONG time;
};

// Ordering predicate on Data: true when lhs has a later time than rhs.
// Used as the comparator of std::priority_queue, this puts the element with
// the smallest time on top.
class Compare
{
public:
    // const-qualified so the predicate can also be invoked through a const
    // comparator object.
    bool operator()(const Data & lhs, const Data & rhs) const { return lhs.getTime() > rhs.getTime(); }
};

Data.cpp:

// One directive per line: anything following the first #include on the same
// line is not processed as a directive.
#include "stdafx.h"
#include "Data.h"

// Stores the input index, a copy of the line text and its time value.
Data::Data(DWORD i_index,
    const std::string & i_line,
    ULONG i_time)
    : index(i_index)
    , line(i_line)
    , time(i_time)
{
}

// Nothing to release explicitly; the members clean up after themselves.
Data::~Data()
{
}

main.cpp:

// NOTE(review): stdafx.h is presumably the MSVC precompiled header and is
// therefore kept as the first include - confirm against the project settings.
#include "stdafx.h"
// One directive per line: anything following the first #include on the same
// line is not processed as a directive.
#include "boost/regex.hpp"
#include "boost/lexical_cast.hpp"
#include "boost/filesystem.hpp"
#include <string>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <sstream>
#include <climits>
#include <ctime>
#include <queue>
#include <utility>
#include <vector>
#include "Data.h"
namespace fs = boost::filesystem;

// Matches a line that starts with either "TIME = YYYY/MM/DD " or "@" and
// captures, in group 1, the run of digits, ':', '.' and ',' that follows.
static const boost::regex expression(R"(^(?:(?:TIME\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))");
// Matches a file name that starts with a run of digits, '-' and '_' and
// captures, in group 1, the one or two words that follow. main() stores that
// capture next to the stream and it is appended to every merged line.
static const boost::regex nameFileEx(R"(^[\d\-\_]+(\w+\s?\w+|\w+))");
// Directory that is scanned for input logs and that receives the merged file.
static const std::string path("E:\\2017-02-01");
//static const std::string path("E:\\TestLog");

// Converts a time of day written as "H:M:S" or "H:M:S,mmm" into milliseconds
// since midnight.
//
// time  : text to parse; parsing stops at the first character that does not
//         fit the pattern, trailing characters are ignored.
// return: hours*3600000 + minutes*60000 + seconds*1000 + milliseconds.
//         The milliseconds default to 0 when the ",mmm" part is absent.
//         Returns 0 when fewer than three numeric fields could be read, so
//         every call has a defined result.
unsigned long time2Milleseconds(const std::string & time)
{
    // separators[k] is the character required in front of field k + 1.
    static const char separators[] = { ':', ':', ',' };
    unsigned long fields[4] = { 0, 0, 0, 0 }; // hours, minutes, seconds, milliseconds
    std::string::size_type pos = 0;
    int parsed = 0;

    while (parsed < 4)
    {
        if (parsed > 0)
        {
            if (pos >= time.size() || time[pos] != separators[parsed - 1])
                break;
            ++pos;
        }

        const std::string::size_type start = pos;
        unsigned long value = 0;
        while (pos < time.size() && time[pos] >= '0' && time[pos] <= '9')
        {
            value = value * 10 + static_cast<unsigned long>(time[pos] - '0');
            ++pos;
        }
        if (pos == start)
            break; // a number was expected here but no digit was found

        fields[parsed] = value;
        ++parsed;
    }

    if (parsed < 3)
        return 0;

    // Unsigned long arithmetic throughout: no signed overflow on odd input.
    return fields[0] * 3600000UL + fields[1] * 60000UL + fields[2] * 1000UL + fields[3];
}

void initializeHeap(std::ifstream & ifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, const int index)
{
ULONG time;
std::string line;
boost::smatch what;
bool match = false;
while (!match && std::getline(ifs, line))
{
if (boost::regex_search(line, what, expression))
{
time = time2Milleseconds(what[1]);
myHeap.push(Data(index, line, time));
match = true;
}
}
}

void checkRegex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, ULONG time, const int index)
{
std::string line;
boost::smatch what;
if (std::getline(vifs[index].first, line))
{
if (boost::regex_search(line, what, expression))
{
time = time2Milleseconds(what[1]);
}
myHeap.push(Data(index, line, time));
}
}

void mergeFiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, std::ofstream & file)
{
int index = 0;
ULONG time = 0;
while (!myHeap.empty())
{
index = myHeap.top().getIndex();
time = myHeap.top().getTime();
file << myHeap.top().getLine() << " --> " << vifs[index].second << "\n";
myHeap.pop();
checkRegex(vifs, myHeap, time, index);
}
}

int main()
{
clock_t begin = clock();
int cnt = std::count_if(fs::directory_iterator(path), fs::directory_iterator(), static_cast<bool(*)(const fs::path&)>(fs::is_regular_file));
std::priority_queue<Data, std::vector<Data>, Compare> myHeap;
std::vector<std::pair<std::ifstream, std::string>> vifs(cnt);
int index = 0;
boost::smatch what;
std::string file;
for (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d)
{
if (fs::is_regular_file(d->path()))
{
file = d->path().filename().string();
if (boost::regex_search(file, what, nameFileEx))
{
vifs[index] = std::make_pair(std::ifstream(d->path().string()), what[1]);
initializeHeap(vifs[index].first, myHeap, index);
++index;
}
}
}
std::ofstream filename(path + "\\TestLog.txt");
mergeFiles(vifs, myHeap, filename);
filename.close();
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
std::cout << "Elapsed time = " << elapsed_secs << "\n";
return 0;
}

После всей этой работы я понял, что вчера запускал программу в конфигурации Debug. Запустив обе реализации в Release, я получил следующие результаты:

Векторная реализация: около 25 секунд
Реализация кучи: около 27 секунд

Таким образом, либо моя реализация со структурой кучи не оптимизирована, либо обе реализации равны по времени выполнения.

Есть ли что-то еще, что я могу сделать, чтобы ускорить выполнение?

0

algorithm c++ external-sorting

Решение

Другие решения

Других решений пока нет …

Источник

Accepted Answer

Это можно сделать быстрее и с небольшим объемом памяти. Рассмотрим сначала:

Прочитайте по одной строке из каждого файла (так что в памяти в любой момент находится только N строк).
Найдите наименьшую из N строк и выведите её.
В памяти замените только что выведенную строку следующей строкой из того файла, из которого она была получена (позаботьтесь о случае EOF).

Если M — длина вашего выходного файла (т.е. суммарная длина всех журналов), то тривиальная реализация будет иметь сложность O(N * M).

Тем не менее, вышеописанное можно улучшить с помощью кучи, которая сокращает время до O(M log N). То есть поместите N элементов, находящихся в памяти, в кучу. Извлекайте вершину кучи, чтобы вывести наименьший элемент. Затем, прочитав новую строку, просто добавьте её обратно в кучу.

2