mekhanoidy-tools/src/db_add_language/db_add_language.cpp
2020-01-12 22:16:48 +03:00

322 lines
9.4 KiB
C++

/*
* AIM db_extractor
* Copyright (C) 2017 lzwdgc
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <common.h>
#include <db.h>
#include <Polygon4/DataManager/Localization.h>
#include <Polygon4/DataManager/Storage.h>
#include <Polygon4/DataManager/Types.h>
#include <primitives/filesystem.h>
#include <primitives/executor.h>
#include <primitives/sw/main.h>
#include <primitives/sw/settings.h>
#include <primitives/sw/cl.h>
#include <algorithm>
#include <atomic>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <math.h>
#include <numeric>
#include <vector>
// Language code -> code page number; get_cp() looks languages up here.
// MultiByteToWideChar: https://msdn.microsoft.com/en-us/library/windows/desktop/dd319072(v=vs.85).aspx
// code pages: https://msdn.microsoft.com/en-us/library/windows/desktop/dd317756(v=vs.85).aspx
// NOTE(review): 0 for "en" is presumably the system default code page (CP_ACP) — confirm.
const std::map<std::string, int> code_pages = {
    {"en", 0},
    {"ru", 1251},
    {"et", 1257},
};
// Returns the code page registered in code_pages for language `cp`.
// Throws std::runtime_error when the language has no entry.
int get_cp(const std::string &cp)
{
    const auto entry = code_pages.find(cp);
    if (entry != code_pages.end())
        return entry->second;
    throw std::runtime_error("No code page for lang: " + cp);
}
// A converted text value plus the id of the storage string it has been matched to.
struct string_index
{
    // Text as returned by str2utf16().
    std::wstring s;

    // Id of the matched storage string; stays -1 until one is assigned.
    polygon4::detail::IdType i = -1;

    // Converts `rhs` with str2utf16() using code page `cp` and stores the result in `s`.
    void setString(const std::string &rhs, int cp)
    {
        this->s = str2utf16(rhs, cp);
    }
};
// Record name -> converted text (plus the storage id once resolved), ordered by name.
using AimKV = std::map<std::string, string_index>;
// Record name -> id of the storage string it was resolved to.
using AimKVResolved = std::unordered_map<std::string, polygon4::detail::IdType>;
// Global name -> id table: assigned in main() from get_kv_resolved() and read by process_lang().
AimKVResolved kv_resolved;
// Computes the Levenshtein (edit) distance between two sequences.
//
// T must provide size() and operator[], and its elements must be comparable
// with ==. Works for std::string, std::wstring and similar containers.
//
// Returns the minimum number of single-element insertions, deletions and
// substitutions needed to turn s1 into s2. Two empty inputs yield 0.
template <class T>
int levenshtein_distance(const T &s1, const T &s2)
{
    const int s1len = static_cast<int>(s1.size());
    const int s2len = static_cast<int>(s2.size());

    // Single rolling column of the DP table; column[y] holds the distance
    // between the first y elements of s1 and the prefix of s2 processed so far.
    // Filling from index 0 matters: column[0] is the result when s1 is empty.
    std::vector<int> column(s1len + 1);
    std::iota(column.begin(), column.end(), 0);

    for (int x = 1; x <= s2len; ++x)
    {
        // Value of the cell diagonally up-left of column[1] (i.e. x - 1).
        int last_diagonal = column[0];
        column[0] = x;
        for (int y = 1; y <= s1len; ++y)
        {
            const int old_diagonal = column[y];
            column[y] = std::min({
                column[y] + 1,                                       // deletion
                column[y - 1] + 1,                                   // insertion
                last_diagonal + (s1[y - 1] == s2[x - 1] ? 0 : 1)     // substitution / match
            });
            last_diagonal = old_diagonal;
        }
    }
    return column[s1len];
}
auto open(const path &p)
{
db db;
if (fs::exists(p / "quest.dat"))
db.open(p / "quest");
return db;
};
// Collects the text of every named record of the INFORMATION table.
//
// For each value of that table with a non-empty name, every non-empty NAME or
// TEXT field is converted with code page `cp` and stored under the record
// name; when a record has several such fields the last one visited wins.
//
// Throws std::runtime_error if the INFORMATION table or its NAME / TEXT
// fields are not present in `db`.
AimKV get_kv(const db &db, int cp)
{
    const auto &tables = db.t.tables;
    const auto info = std::find_if(tables.begin(), tables.end(), [](auto &entry) {
        return entry.second.name == "INFORMATION";
    });
    if (info == tables.end())
        throw std::runtime_error("Table INFORMATION was not found");
    const auto &info_table = info->second;

    // Looks up the key of the field called `name` that belongs to the INFORMATION table.
    auto field_key = [&db, &info_table](const std::string &name)
    {
        const auto &fields = db.t.fields;
        const auto hit = std::find_if(fields.begin(), fields.end(), [&info_table, &name](auto &entry) {
            return entry.second.table_id == info_table.id && entry.second.name == name;
        });
        if (hit == fields.end())
            throw std::runtime_error("Field " + name + " was not found");
        return hit->first;
    };

    const auto name_field = field_key("NAME");
    const auto text_field = field_key("TEXT");

    AimKV result;
    for (auto &value : db.values)
    {
        const bool wanted = value.table_id == info_table.id && !value.name.empty();
        if (!wanted)
            continue;

        for (auto &field : value.fields)
        {
            const bool is_text_field = field.field_id == name_field || field.field_id == text_field;
            if (is_text_field && !field.s.empty())
                result[value.name].setString(field.s, cp);
        }
    }
    return result;
}
// Builds the record-name -> storage-string-id table.
//
// If a file named "kv.resolved" exists (relative to the current directory) the
// table is read from it: one `"quoted name" id` record per line.
// Otherwise the Russian databases under `d / "ru" / "aim1"` and
// `d / "ru" / "aim2"` are loaded, every record is matched to the storage string
// whose `ru` text has the smallest Levenshtein distance, and the result is
// written to "kv.resolved" for the next run.
//
// Finally the table is made unique by id: when several names map to the same
// id only one of them is kept (which one depends on unordered_map iteration
// order).
//
// Exceptions from get_cp() / get_kv() propagate to the caller.
AimKVResolved get_kv_resolved(const path &d, const polygon4::Storage &storage)
{
    static const auto fn = "kv.resolved";

    AimKVResolved mres;
    if (fs::exists(fn))
    {
        std::ifstream f(fn);
        std::string s;
        polygon4::detail::IdType i;
        // Accept only complete records; a truncated trailing record is dropped
        // instead of being stored with a bogus id.
        while (f >> std::quoted(s) >> i)
            mres[s] = i;
    }
    else
    {
        auto db1 = open(d / "ru" / "aim1");
        auto db2 = open(d / "ru" / "aim2");
        auto kv1 = get_kv(db1, get_cp("ru"));
        auto kv2 = get_kv(db2, get_cp("ru"));
        // std::map::insert keeps existing keys, so aim1 entries win over aim2.
        kv1.insert(kv2.begin(), kv2.end());

        auto sz = kv1.size();
        std::cout << "total kvs: " << sz << "\n";

        Executor e;
        // Progress counter shared by all tasks. Atomic because the tasks are
        // handed to the Executor and may run concurrently.
        // NOTE(review): assumes Executor runs tasks on worker threads — confirm.
        std::atomic<int> done{0};
        for (auto &kv : kv1)
        {
            // Each task writes only to its own map element (kv.second.i).
            e.push([&storage, &done, &sz, &kv]()
            {
                std::cout << "total kvs: " << ++done << "/" << sz << "\n";

                // Running minimum over all storage strings. `<=` keeps the
                // last candidate among equal distances.
                bool found = false;
                int best = 0;
                polygon4::detail::IdType best_id{};
                for (auto &s : storage.strings)
                {
                    const int dist = levenshtein_distance<std::wstring>(kv.second.s, s.second->string.ru);
                    if (!found || dist <= best)
                    {
                        found = true;
                        best = dist;
                        best_id = s.first;
                    }
                }
                if (!found)
                    return;
                kv.second.i = best_id;
            });
        }
        e.wait();

        std::ofstream f(fn);
        for (auto &kv : kv1)
        {
            mres[kv.first] = kv.second.i;
            f << std::quoted(kv.first) << " " << kv.second.i << "\n";
        }
    }

    // make unique ids
    std::unordered_map<AimKVResolved::mapped_type, AimKVResolved::key_type> u;
    for (auto &kv : mres)
        u[kv.second] = kv.first;
    mres.clear();
    for (auto &kv : u)
        mres[kv.second] = kv.first;
    return mres;
}
// Writes a diff report comparing the texts of one language's databases with
// the strings currently held in storage.
//
// s     - storage whose strings are compared (read through `field`).
// p     - language directory; its last path component is used as the language
//         code for get_cp() and as the prefix of the report file name.
//         Databases are looked up in `p`, `p / "aim1"` and `p / "aim2"`.
// field - pointer to the LocalizedString member holding that language.
//
// Only records whose name is present in the global kv_resolved table are
// reported. Entries are ordered by Levenshtein distance divided by the average
// length of the two texts, closest matches first. The report is written to
// `p / "<lang>_diff.txt"`.
//
// Exceptions from get_cp() / get_kv() propagate to the caller.
void process_lang(polygon4::Storage &s, const path &p, polygon4::String polygon4::LocalizedString::*field)
{
    auto db1 = open(p);
    auto db2 = open(p / "aim1");
    auto db3 = open(p / "aim2");

    AimKV kvm;
    // Merges the records of one database into kvm; databases without values
    // are skipped. std::map::insert keeps existing keys, so earlier databases
    // take precedence.
    auto collect = [&kvm, &p](auto &db)
    {
        if (!db.number_of_values)
            return;
        auto kv1 = ::get_kv(db, get_cp(p.filename().u8string()));
        kvm.insert(kv1.begin(), kv1.end());
    };
    collect(db1);
    collect(db2);
    collect(db3);

    // Normalized distance -> record name, sorted ascending by the multimap.
    std::multimap<double, std::string> dist2;
    for (auto &kv : kvm)
    {
        auto i = kv_resolved.find(kv.first);
        if (i == kv_resolved.end())
            continue;
        auto &sold = s.strings[i->second]->string.*field;
        const auto d = levenshtein_distance<std::wstring>(sold, kv.second.s);
        const auto avg_len = (sold.size() + kv.second.s.size()) / 2.0;
        // Two empty texts would give 0/0 = NaN, which must never be used as
        // an ordered-container key; treat them as identical instead.
        dist2.insert({ avg_len > 0 ? d / avg_len : 0.0, kv.first });
    }

    std::string str;
    for (auto &d2 : dist2)
    {
        auto &kv = *kvm.find(d2.second);
        auto i = kv_resolved.find(kv.first);
        if (i == kv_resolved.end())
            continue;
        auto &sold = s.strings[i->second]->string.*field;
        str += "id: " + std::to_string(i->second) + "\n";
        str += "kd: " + std::to_string(d2.first) + "\n";
        str += "key: " + i->first + "\n\n";
        str += "old:\n";
        str += wstring2string(sold) + "\n";
        str += "\n";
        str += "new:\n";
        str += wstring2string(kv.second.s) + "\n";
        str += "\n================================================\n\n";
    }

    write_file(p / (p.filename() += "_diff.txt"), str);
}
// Entry point.
//
// Positional arguments: <db file> <dir to lang dbs>.
// Loads the storage from the db file, builds the global kv_resolved table,
// writes a diff report for "ru" as a sanity check, then writes a diff report
// for every other sub-directory of the language directory whose name matches
// a language listed via ADD_LANGUAGE in Languages.inl. Sub-directories with an
// unknown name are reported on stderr.
int main(int argc, char *argv[])
{
    cl::opt<path> db_fn(cl::Positional, cl::desc("<db file>"), cl::Required);
    cl::opt<path> dir_to_lang_dbs(cl::Positional, cl::desc("<dir to lang dbs>"), cl::Required);

    cl::ParseCommandLineOptions(argc, argv);

    auto storage = polygon4::initStorage(db_fn);
    storage->load();
    kv_resolved = get_kv_resolved(dir_to_lang_dbs, *storage.get());

    // to check correctness
    process_lang(*storage.get(), dir_to_lang_dbs / "ru", &polygon4::LocalizedString::ru);

    for (auto &f : fs::directory_iterator(dir_to_lang_dbs))
    {
        if (!fs::is_directory(f))
            continue;

        auto p = f.path();

        // "ru" has already been processed above; skip it here so that it is
        // not reported as an unknown language by the fallback branch.
        if (p.filename() == "ru")
            continue;

        // The empty `if (0);` gives the generated `else if` chain something to
        // attach to. NOTE(review): Languages.inl presumably invokes
        // ADD_LANGUAGE(l, n) once per supported language — confirm.
        if (0);
#define ADD_LANGUAGE(l, n) else if (p.filename() == #l) \
        {process_lang(*storage.get(), p, &polygon4::LocalizedString::l);}
#include <Polygon4/DataManager/Languages.inl>
#undef ADD_LANGUAGE
        else
        {
            std::cerr << "No such lang: " << p.filename().u8string() << "\n";
            continue;
        }
    }

    return 0;
}