LXGZJ237 发表于 2022-12-4 00:34

判断文件编码(UTF8,8BOM,16LE,16BE,ANSI)附string互转wstring、String互转Wchar_...

本帖最后由 2370177068 于 2023-12-24 18:41 编辑

至于算不算原创,我自己也说不清:很多代码都是复制粘贴(Ctrl+C / Ctrl+V)来的,由我拼凑在一起,能用就行。
前面几个函数是字符串编码转换;split 用于把 std::string 按分隔符切分成 vector;get_last_error 用于获取错误码对应的错误文本。

代码写得比较粗糙,大家使用之前请先写个控制台程序测试一下。


#pragma once
#ifndef MYTOOL_H
#define MYTOOL_H

#include <stdio.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <atlconv.h>

// Collection of static helpers for Windows string conversion, string
// splitting, Win32 error text lookup and text-file encoding detection.
// Every member function is static; the class carries no state.
class MYTOOL {
public:
    MYTOOL() = default;
    ~MYTOOL() = default;
    // Converts a narrow string to a wide string via MultiByteToWideChar(CP_ACP).
    static std::wstring string2wstring(std::string str);
    // Converts a wide string to a narrow string via WideCharToMultiByte(CP_ACP).
    static std::string wstring2string(std::wstring wstr);
    // Splits `str` on every occurrence of `pattern` and returns the pieces.
    static std::vector<std::string> split(std::string str, std::string pattern);
    // Converts the NUL-terminated wide string `wchar` to a narrow string and
    // stores the result in `szDst`.
    static void Wchar_tToString(std::string& szDst, wchar_t* wchar);
    // Converts `str` to a wide string held in memory obtained with `new`;
    // the function itself never frees the returned pointer.
    static wchar_t* StringToWchar_t(std::string& str);
    // Returns the FormatMessage text for `errCode`. The default argument is
    // GetLastError(), and a value of 0 is replaced by GetLastError() as well.
    static std::string get_last_error(DWORD errCode = GetLastError());
    // Returns 1 when the `size` bytes at `pBuffer` pass the UTF-8 check, else 0.
    static int IsUTF8(const void* pBuffer, long size);
    // Reads the file at `filePath` into `str` and returns an encoding code:
    // 0 = read failed, 1 = UTF-8, 2 = UTF-16LE, 3 = UTF-16BE, 4 = UTF-8 with BOM,
    // 5 = none of the above (possibly ANSI).
    static int CalculateFileEncoding(LPCSTR filePath, std::string& str);
    // Wide-path variant of CalculateFileEncoding; same return codes.
    static int CalculateFileEncodingW(LPCWSTR filePath, std::string& str);
    // Calls CalculateFileEncoding and returns the code as a name
    // ("READ_FAIL", "UTF-8", "UTF-16LE_BOM", "UTF16_BE_BOM", "UTF8_BOM", "ANSI",
    // or "ERROR" for any other code).
    static std::string 判断文件编码(LPCSTR filePath, std::string& 读到的文本);
    // Wide-path variant: calls CalculateFileEncodingW and returns the same names.
    static std::string 判断文件编码W(LPCWSTR filePath, std::string& 读到的文本);
    // Opens `file` with mode "r,ccs=UTF-8", reads it with fgetws and returns the
    // text converted through Wchar_tToString; returns "" when opening fails.
    static std::string 读取文件UTF8(std::string file);
    // Same as 读取文件UTF8 but takes the path as a wide string.
    static std::string 读取文件UTF8W(const wchar_t* file);
private:

};

std::string MYTOOL::读取文件UTF8(std::string file) {
    FILE* fp;
    auto err = _wfopen_s(&fp, string2wstring(file).c_str(), L"r,ccs=UTF-8");
    if (fp == NULL) {
      return "";
    }
    if (err != 0) {
      return "";
    }
    std::string sum;
    wchar_t str = { 0 };
    while (fgetws(str, 1024, fp) != NULL) {
      std::string 当前行内容;
      Wchar_tToString(当前行内容, str);
      sum += 当前行内容;
    }
    fclose(fp);
    return sum;
}

std::string MYTOOL::读取文件UTF8W(const wchar_t* file) {
    FILE* fp;
    auto err = _wfopen_s(&fp, file, L"r,ccs=UTF-8");
    if (fp == NULL) {
      return "";
    }
    if (err != 0) {
      return "";
    }
    std::string sum;
    wchar_t str = { 0 };
    while (fgetws(str, 1024, fp) != NULL) {
      std::string 当前行内容;
      Wchar_tToString(当前行内容, str);
      sum += 当前行内容;
    }
    fclose(fp);
    return sum;
}

std::wstring MYTOOL::string2wstring(std::string str) {
    std::wstring result;
    int len = MultiByteToWideChar(CP_ACP, 0, str.c_str(), (int)str.size(), NULL, 0);
    wchar_t* buffer = new wchar_t;
    MultiByteToWideChar(CP_ACP, 0, str.c_str(), (int)str.size(), buffer, len);
    buffer = '\0';
    result.append(buffer);
    delete[] buffer;
    return result;
}

std::string MYTOOL::wstring2string(std::wstring wstr) {
    std::string result;
    int len = WideCharToMultiByte(CP_ACP, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL);
    char* buffer = new char;
    WideCharToMultiByte(CP_ACP, 0, wstr.c_str(), (int)wstr.size(), buffer, len, NULL, NULL);
    buffer = '\0';
    result.append(buffer);
    delete[] buffer;
    return result;
}

/// Splits `str` on every occurrence of `pattern`.
///
/// Empty pieces are kept, so "a,,b" split on "," gives {"a", "", "b"}, a
/// trailing separator produces a trailing empty piece, and an empty `str`
/// gives {""}.
///
/// @param str      Text to split.
/// @param pattern  Separator. When it is empty no splitting is possible: the
///                 result is {str}, or an empty vector when `str` is empty too.
/// @return The pieces in order of appearance.
///
/// Declared `inline` because the definition lives in a header.
inline std::vector<std::string> MYTOOL::split(std::string str, std::string pattern) {
    std::vector<std::string> result;

    // An empty separator matches everywhere and would never advance.
    if (pattern.empty()) {
        if (!str.empty()) {
            result.push_back(str);
        }
        return result;
    }

    // Work with size_type throughout so npos is never truncated to int, and
    // search the original text only so no artificial matches appear at the end.
    std::string::size_type begin = 0;
    for (;;) {
        const std::string::size_type hit = str.find(pattern, begin);
        if (hit == std::string::npos) {
            result.push_back(str.substr(begin));
            break;
        }
        result.push_back(str.substr(begin, hit - begin));
        begin = hit + pattern.size();
    }
    return result;
}

void MYTOOL::Wchar_tToString(std::string& szDst, wchar_t* wchar) {
    wchar_t* wText = wchar;
    DWORD dwNum = WideCharToMultiByte(CP_OEMCP, NULL, wText, -1, NULL, 0, NULL, FALSE);
    char* psText;
    psText = new char;
    WideCharToMultiByte(CP_ACP, NULL, wText, -1, psText, dwNum, NULL, FALSE);
    szDst = psText;
    delete[]psText;
}

wchar_t* MYTOOL::StringToWchar_t(std::string& str) {
    wchar_t* text1 = new wchar_t;
    swprintf(text1, str.size() + 1, L"%S ", str.c_str());
    return text1;
}

/// Returns the system message text for a Win32 error code.
///
/// @param errCode  Error code to describe. A value of 0 is replaced by the
///                 calling thread's current GetLastError() value.
/// @return The message from FormatMessageW converted with wstring2string, or
///         "{未定义错误描述(<code>)}" when the system has no text for the code.
///
/// Declared `inline` because the definition lives in a header.
inline std::string MYTOOL::get_last_error(DWORD errCode) {
    if (errCode == 0) {
        errCode = GetLastError();
    }

    // Use the wide API explicitly so the result always matches wstring2string,
    // whether or not the project is built with UNICODE defined.
    LPWSTR message = NULL;
    const DWORD length = FormatMessageW(
        FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
        NULL, errCode, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
        reinterpret_cast<LPWSTR>(&message), 0, NULL);

    // Failure: no description available for this code.
    if (length == 0 || message == NULL) {
        // DWORD is unsigned; std::to_string prints it without sign surprises.
        return "{未定义错误描述(" + std::to_string(errCode) + ")}";
    }

    // Success: convert, making sure the system-allocated buffer is released
    // even if the conversion throws.
    std::string err;
    try {
        err = wstring2string(std::wstring(message, length));
    } catch (...) {
        LocalFree(message);
        throw;
    }
    LocalFree(message);
    return err;
}

int MYTOOL::IsUTF8(const void* pBuffer, long size) {
    int IsUTF8 = 1;
    unsigned char* start = (unsigned char*)pBuffer;
    unsigned char* end = (unsigned char*)pBuffer + size;
    while (start < end) {
      if (*start < 0x80) {
            start++;
      } else if (*start < (0xC0)) {
            IsUTF8 = 0; break;
      } else if (*start < (0xE0)) {
            if (start >= end - 1) break;
            if ((start & (0xC0)) != 0x80) { IsUTF8 = 0; break; }
            start += 2;
      } else if (*start < (0xF0)) {
            if (start >= end - 2) break;
            if ((start & (0xC0)) != 0x80 || (start & (0xC0)) != 0x80) { IsUTF8 = 0; break; }
            start += 3;
      } else { IsUTF8 = 0; break; }
    }
    return IsUTF8;
}

int MYTOOL::CalculateFileEncoding(LPCSTR filePath, std::string& str) {
    /*返回值说明
    * 0   文件读取失败
    * 1   UTF-8
    * 2   UTF-16LE
    * 3   UTF16_BE
    * 4   UTF8_BOM
    * 5   未知
    */
    HANDLE pFile; char* tmpBuf;
    DWORD fileSize, dwBytesRead, dwBytesToRead;
    pFile = CreateFileA(filePath, FILE_GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (pFile == INVALID_HANDLE_VALUE) { CloseHandle(pFile); return 0; }
    fileSize = GetFileSize(pFile, NULL);
    char* buffer = new char[(size_t)fileSize + (size_t)1];
    //buffer = (char*)malloc(static_cast<size_t>(fileSize) + (size_t)1);
    //if (buffer == NULL) { return 0; }
    ZeroMemory(buffer, (size_t)fileSize + (size_t)1);
    dwBytesToRead = fileSize;
    dwBytesRead = 0;
    tmpBuf = buffer;
    do {
      BOOL success = ReadFile(pFile, tmpBuf, dwBytesToRead, &dwBytesRead, NULL);
      if (success == NULL) printf("ReadFile failed : %s", get_last_error().c_str());
      if (dwBytesRead == 0) break;
      dwBytesToRead -= dwBytesRead;
      tmpBuf += dwBytesRead;
    } while (dwBytesToRead > 0);
    CloseHandle(pFile);
    // 处理读到的数据 buffer
    //puts(buffer);
    //std::cout << "buffer0:" << (int)buffer << std::endl;
    //std::cout << "buffer1:" << (int)buffer << std::endl;
    //std::cout << "buffer2:" << (int)buffer << std::endl;
    str.clear();
    str += buffer;
    if (buffer == 0xFF && buffer == 0xFE) {
      return 2;//UTF16_LE
    } else if (buffer == -1 && buffer == -2) {
      return 2;//UTF16_LE
    } else if (buffer == 0xFE && buffer == 0xFF) {
      return 3;//UTF16_BE
    } else if (buffer == -2 && buffer == -1) {
      return 3;//UTF16_BE
    } else if (buffer == 0xEF && buffer == 0xBB && buffer == 0xBF) {
      return 4;//UTF8_BOM
    } else if (buffer == -17 && buffer == -69 && buffer == -65) {
      return 4;//UTF8_BOM
    } else if (IsUTF8(buffer, fileSize + 1)) {
      return 1;//UTF-8
    } else {
      return 5;//以上都不是,可能是ANSI
    }
}

int MYTOOL::CalculateFileEncodingW(LPCWSTR filePath, std::string& str) {
    /*返回值说明
    * 0   文件读取失败
    * 1   UTF-8
    * 2   UTF-16LE
    * 3   UTF16_BE
    * 4   UTF8_BOM
    * 5   未知
    */
    HANDLE pFile; char* tmpBuf;
    DWORD fileSize, dwBytesRead, dwBytesToRead;
    pFile = CreateFileW(filePath, FILE_GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (pFile == INVALID_HANDLE_VALUE) { CloseHandle(pFile); return 0; }
    fileSize = GetFileSize(pFile, NULL);
    char* buffer = new char;
    buffer = (char*)malloc(static_cast<size_t>(fileSize) + (size_t)1);
    if (buffer == NULL) { return 0; }
    ZeroMemory(buffer, static_cast<size_t>(fileSize) + (size_t)1);
    dwBytesToRead = fileSize;
    dwBytesRead = 0;
    tmpBuf = buffer;
    do {
      BOOL success = ReadFile(pFile, tmpBuf, dwBytesToRead, &dwBytesRead, NULL);
      if (success == NULL) printf("ReadFile failed : %s", get_last_error().c_str());
      if (dwBytesRead == 0) break;
      dwBytesToRead -= dwBytesRead;
      tmpBuf += dwBytesRead;
    } while (dwBytesToRead > 0);
    CloseHandle(pFile);
    // 处理读到的数据 buffer
    //puts(buffer);
    //std::cout << "buffer0:" << (int)buffer << std::endl;
    //std::cout << "buffer1:" << (int)buffer << std::endl;
    //std::cout << "buffer2:" << (int)buffer << std::endl;
    str.clear();
    str += buffer;
    if (buffer == 0xFF && buffer == 0xFE) {
      return 2;//UTF16_LE
    } else if (buffer == -1 && buffer == -2) {//我自己调试了,FF是255,但是实际读到的是-1,所以才加了这么几行-1 -2 -17的
      return 2;//UTF16_LE
    } else if (buffer == 0xFE && buffer == 0xFF) {
      return 3;//UTF16_BE
    } else if (buffer == -2 && buffer == -1) {
      return 3;//UTF16_BE
    } else if (buffer == 0xEF && buffer == 0xBB && buffer == 0xBF) {
      return 4;//UTF8_BOM
    } else if (buffer == -17 && buffer == -69 && buffer == -65) {
      return 4;//UTF8_BOM
    } else if (IsUTF8(buffer, fileSize + 1)) {
      return 1;//UTF-8
    } else {
      return 5;//以上都不是,可能是ANSI
    }
}

std::string MYTOOL::判断文件编码(LPCSTR filePath, std::string& 读到的文本) {
    /*返回值说明
    * 0   文件读取失败
    * 1   UTF-8
    * 2   UTF-16LE
    * 3   UTF16_BE
    * 4   UTF8_BOM
    * 5   未知
    */
    int ret = CalculateFileEncoding(filePath, 读到的文本);
    switch (ret) {
    case 0:
      return "READ_FAIL";
      break;
    case 1:
      return "UTF-8";
      break;
    case 2:
      return "UTF-16LE_BOM";
      break;
    case 3:
      return "UTF16_BE_BOM";
      break;
    case 4:
      return "UTF8_BOM";
      break;
    case 5://以上都不是,可能是ANSI
      return "ANSI";
      break;
    default:
      return "ERROR";
      break;
    }
}

std::string MYTOOL::判断文件编码W(LPCWSTR filePath, std::string& 读到的文本) {
    /*返回值说明
    * 0   文件读取失败
    * 1   UTF-8
    * 2   UTF-16LE
    * 3   UTF16_BE
    * 4   UTF8_BOM
    * 5   未知
    */
    int ret = CalculateFileEncodingW(filePath, 读到的文本);
    switch (ret) {
    case 0:
      return "READ_FAIL";
      break;
    case 1:
      return "UTF-8";
      break;
    case 2:
      return "UTF-16LE_BOM";
      break;
    case 3:
      return "UTF16_BE_BOM";
      break;
    case 4:
      return "UTF8_BOM";
      break;
    case 5://以上都不是,可能是ANSI
      return "ANSI";
      break;
    default:
      return "ERROR";
      break;
    }
}

#endif

銀鈅 发表于 2023-6-3 12:34

谢谢楼主分享

Burpcka 发表于 2022-12-4 05:12

受教了感谢大佬

8970665 发表于 2022-12-4 09:39

收藏了感谢

sbwfnhn 发表于 2022-12-6 09:46

先收藏,以后转化成其它语言,可能用的上。{:1_918:}
页: [1]
查看完整版本: 判断文件编码(UTF8,8BOM,16LE,16BE,ANSI)附string互转wstring、String互转Wchar_...