This commit is contained in:
Mike 2025-05-15 07:23:45 +00:00
parent c3e8f84e3c
commit db703d6ed6

207
max/max-scraper.py Normal file
View File

@ -0,0 +1,207 @@
import os
import time
import random
import string
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
SEEN_LINKS_FILE = "seen_links.txt"
GENRE_URLS = [
"https://play.max.com/genre/action",
"https://play.max.com/genre/comedy",
"https://play.max.com/genre/drama",
"https://play.max.com/genre/horror",
"https://play.max.com/genre/sci-fi",
"https://play.max.com/genre/documentary"
]
def setup_browser():
options = Options()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-blink-features=AutomationControlled")
return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
def scroll_to_bottom(driver, max_wait=60):
print("\U0001f4dc Scrolling to load all movies...")
last_height = driver.execute_script("return document.body.scrollHeight")
start_time = time.time()
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2.5)
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
print("\u2705 Reached bottom of page.")
break
last_height = new_height
if time.time() - start_time > max_wait:
print("\u23f1\ufe0f Scroll timed out.")
break
def extract_movie_links(driver):
anchors = driver.find_elements(By.XPATH, '//a[contains(@href, "/movie/")]')
links = list(set([a.get_attribute("href") for a in anchors if "/movie/" in a.get_attribute("href")]))
return links
def generate_random_query():
letters = string.ascii_lowercase
return ''.join(random.choices(letters, k=random.choice([1, 2])))
def load_seen_links():
if not os.path.exists(SEEN_LINKS_FILE):
return set()
with open(SEEN_LINKS_FILE, "r") as f:
return set([line.strip() for line in f if line.strip()])
def save_seen_links(new_links):
with open(SEEN_LINKS_FILE, "a") as f:
for link in new_links:
f.write(link + "\n")
def save_links(links, batch_id, prefix):
output_dir = "output_movies"
os.makedirs(output_dir, exist_ok=True)
for i in range(0, len(links), 10):
chunk = links[i:i+10]
file_path = os.path.join(output_dir, f"{prefix}_{batch_id}_{(i // 10) + 1}.txt")
with open(file_path, "w") as f:
for link in chunk:
f.write(link + "\n")
print(f"\U0001f4be Saved {len(chunk)} links to {file_path}")
def run_movies_mode(driver, seen_links, batch_counter):
driver.get("https://play.max.com/movies")
time.sleep(4)
scroll_to_bottom(driver)
all_links = extract_movie_links(driver)
print(f"\u2705 Loaded {len(all_links)} movie links.")
while True:
try:
count = int(input("\U0001f522 How many random movies do you want to extract? "))
break
except ValueError:
print("\u274c Invalid number.")
sample = random.sample(all_links, min(count, len(all_links)))
unique_links = [link for link in sample if link not in seen_links]
duplicates = len(sample) - len(unique_links)
if unique_links:
save_links(unique_links, batch_counter, "movies")
save_seen_links(unique_links)
print(f"\u2705 Saved {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
else:
print("\u26a0\ufe0f All selected links were duplicates.")
def run_search_mode(driver, seen_links, search_counter):
search_terms = [
"the", "man", "love", "dark", "moon", "fire", "red", "blue", "night",
"girl", "life", "death", "dream", "war", "blood", "star", "light", "king", "queen"
]
max_attempts = 10
found_total = 0
for attempt in range(max_attempts):
query = random.choice(search_terms)
url = f"https://play.max.com/search?q={query}"
print(f"\n\U0001f50d Searching for query: '{query}'")
driver.get(url)
time.sleep(4)
links = extract_movie_links(driver)
unique_links = [link for link in links if link not in seen_links]
duplicates = len(links) - len(unique_links)
if unique_links:
save_links(unique_links, search_counter, "search")
save_seen_links(unique_links)
print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
search_counter += 1
found_total += len(unique_links)
else:
print(f"\u26a0\ufe0f No new links found for query '{query}'.")
print("\u23f3 Waiting 10 seconds before next search...")
time.sleep(10)
print(f"\n\ud83d\udcca Search complete. Found total {found_total} new movie links in {max_attempts} searches.")
def run_genre_mode(driver, seen_links, genre_batch):
found_total = 0
for genre_url in GENRE_URLS:
print(f"\n\U0001f3ac Loading genre: {genre_url}")
driver.get(genre_url)
time.sleep(4)
scroll_to_bottom(driver)
links = extract_movie_links(driver)
unique_links = [link for link in links if link not in seen_links]
duplicates = len(links) - len(unique_links)
if unique_links:
save_links(unique_links, genre_batch, "genre")
save_seen_links(unique_links)
print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
genre_batch += 1
found_total += len(unique_links)
else:
print("\u26a0\ufe0f No new unique links in this genre.")
time.sleep(5)
print("📊 Genre scan complete. Found total", found_total, "new movie links.")
def main():
print("🛠️ Max.com Movie Scraper — Created by Mike | DRMLab.io Project")
driver = setup_browser()
try:
print("\U0001f310 Opening Max.com homepage...")
driver.get("https://play.max.com")
time.sleep(3)
print("\n\U0001f511 Please log in manually in the browser.")
input("\u23f3 After you're logged in, press ENTER here to continue...")
seen_links = load_seen_links()
movie_batch = 1
search_batch = 1
genre_batch = 1
while True:
print("📋 === MAIN MENU ===")
print("1⃣ Extract movies from /movies page")
print("2⃣ Extract random movies via search")
print("3⃣ Extract movies by genre")
print("4⃣ Exit")
choice = input("Enter your choice: ").strip()
if choice == "1":
run_movies_mode(driver, seen_links, movie_batch)
movie_batch += 1
elif choice == "2":
run_search_mode(driver, seen_links, search_batch)
search_batch += 1
elif choice == "3":
run_genre_mode(driver, seen_links, genre_batch)
genre_batch += 1
elif choice == "4":
break
else:
print("\u274c Invalid choice.")
finally:
again = input("⏹ Do you want to close the browser? (yes/no): ").strip().lower()
if again != "yes":
main()
else:
input("✅ Press ENTER to exit.")
driver.quit()
if __name__ == "__main__":
main()