# Extraction residue from a file viewer ("64 lines", "1.9 KiB", "Python"); not part of the script.
"""Report text files under ``root_dir`` that are not plain UTF-8.

Walks the tree, skips tooling directories and known binary extensions, and
prints two lists: files that fail UTF-8 decoding (with a best-guess encoding
label) and UTF-8 files that start with a byte-order mark.
"""
import os
import sys

root_dir = '.'
skipped_dirs = ('.git', 'node_modules', 'unpackage', '.hbuilderx', '.vscode')
binary_exts = ('.png', '.jpg', '.jpeg', '.gif', '.ico', '.ttf', '.woff', '.woff2', '.pdf', '.zip', '.tar', '.gz', '.7z')


def detect_encoding(data):
    """Classify the raw bytes of one file.

    Args:
        data: The complete file content as ``bytes``.

    Returns:
        ``None`` when ``data`` is empty or decodes as plain UTF-8 (nothing to
        report), ``"UTF8-BOM"`` when it starts with the UTF-8 byte-order
        mark, otherwise ``"GBK"``, ``"UTF-16"`` or ``"Other"``.
    """
    if not data:
        return None
    # The BOM check comes first: such files are reported separately and are
    # not validated any further.
    if data.startswith(b'\xef\xbb\xbf'):
        return "UTF8-BOM"
    try:
        data.decode('utf-8')
    except UnicodeDecodeError:
        pass
    else:
        return None
    # Fallback guesses, tried in order. These are heuristics: the first codec
    # that decodes without error wins, which does not prove the file was
    # actually written in that encoding.
    for codec, label in (('gbk', "GBK"), ('utf-16', "UTF-16")):
        try:
            data.decode(codec)
        except UnicodeDecodeError:
            continue
        return label
    return "Other"


def scan_tree(top, skip_names=skipped_dirs, binary_suffixes=binary_exts):
    """Walk ``top`` and collect files that are not plain UTF-8.

    Args:
        top: Directory to scan recursively.
        skip_names: Directory names that are not entered. Names are compared
            exactly, so ``.git`` does not exclude ``.github``.
        binary_suffixes: Lower-case file suffixes that are never inspected.

    Returns:
        A ``(non_utf8, with_bom)`` pair: ``non_utf8`` is a list of
        ``(path, label)`` tuples and ``with_bom`` is a list of paths.
    """
    skip = set(skip_names)
    suffixes = tuple(binary_suffixes)  # str.endswith needs a tuple
    undecodable = []
    bom_prefixed = []
    for root, dirs, files in os.walk(top):
        # Prune in place so os.walk never descends into skipped directories.
        dirs[:] = [d for d in dirs if d not in skip]
        for name in files:
            if name.lower().endswith(suffixes):
                continue
            path = os.path.join(root, name)
            try:
                with open(path, 'rb') as f:
                    data = f.read()
            except OSError as exc:
                # Best effort: an unreadable file must not abort the scan,
                # but say so on stderr instead of dropping it silently.
                print(f"Warning: could not read {path}: {exc}", file=sys.stderr)
                continue
            label = detect_encoding(data)
            if label is None:
                continue
            if label == "UTF8-BOM":
                bom_prefixed.append(path)
            else:
                undecodable.append((path, label))
    return undecodable, bom_prefixed


print(f"Scanning {os.path.abspath(root_dir)} recursively...")
print("Looking for non-UTF8 text files...")

non_utf8, with_bom = scan_tree(root_dir, skipped_dirs, binary_exts)

if non_utf8:
    print(f"\nFound {len(non_utf8)} non-UTF-8 files:")
    print("-" * 100)
    for path, enc in non_utf8:
        print(f"{enc: <10} | {path}")
else:
    print("\nNo non-UTF-8 files found.")

if with_bom:
    print(f"\nFound {len(with_bom)} UTF-8 files with BOM:")
    print("-" * 100)
    for path in with_bom:
        print(f"UTF8-BOM | {path}")

print("\nScan complete.")